In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os # importing os

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

<h1 style="background-color:DodgerBlue;text-align:center;color:white;">Importing Necessary Packages </h1>

In [None]:

# text processing libraries
import re
import string
import nltk
from nltk.corpus import stopwords

#Visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib.style as style
style.use('ggplot')

# XGBoost
import xgboost as xgb
from xgboost import XGBClassifier

# sklearn 
from sklearn import model_selection
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from sklearn.model_selection import GridSearchCV,StratifiedKFold,RandomizedSearchCV

# File system management
import os


from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report , confusion_matrix , accuracy_score
from mlxtend.plotting import plot_confusion_matrix
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

#importing tensorflow libraries
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense

# Suppress warnings 
import warnings
warnings.filterwarnings('ignore')

<h1 style="background-color:DodgerBlue;text-align:center;color:white;">Importing Data</h1>

In [None]:
# Load the competition's training and test tweets.
train = pd.read_csv("/kaggle/input/nlp-getting-started/train.csv")
test = pd.read_csv("/kaggle/input/nlp-getting-started/test.csv")

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Preview the first rows of the test frame.
test.head()

In [None]:
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in display() only appended a stray "None" to the output.
train.info()

In [None]:
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in display() only appended a stray "None" to the output.
test.info()

In [None]:
#!pip install pycomp

In [None]:
# Import only the pycomp helper this notebook uses (instead of `import *`) so
# it is clear where `plot_donut_chart` comes from and nothing else is shadowed.
from pycomp.viz.insights import plot_donut_chart
# HTML is part of the public IPython.display API; IPython.core.display is deprecated for this.
from IPython.display import HTML

# Inject CSS that centers PNG outputs in the rendered notebook.
HTML("""
<style>
.output_png {
    display: table-cell;
    text-align: center;
    vertical-align: middle;
}
</style>
""")

<h1 style="background-color:DodgerBlue;text-align:center;color:white;">Exploratory Text-Data Analysis</h1>

In [None]:
# Class balance of the target column, as a table and as a bar chart.
display(train['target'].value_counts())
plt.figure(figsize=(8,6))
carrier_count = train["target"].value_counts()
sns.set(style="darkgrid")
# x / y must be passed by keyword: recent seaborn releases no longer accept
# them positionally in barplot().
sns.barplot(x=carrier_count.index, y=carrier_count.values,
            alpha=1, edgecolor='k', palette='rocket')
plt.title('Frequency Distribution of target')
plt.ylabel('Number of Occurrences', fontsize=12)
plt.xlabel('target', fontsize=12)
plt.xticks((0,1),('Fake', 'Real'))
plt.show()

In [None]:
# Same class balance as a donut chart; `mapping` turns the 0/1 codes into labels.
mapping = {1: 'Real', 0: 'Fake'}
plot_donut_chart(
    df=train,
    col='target',
    label_names=mapping,
    colors=["#ff7f51", "#ff9b54"],
    title='Target Value Distribution',
)

In [None]:
import plotly.graph_objects as go

# Character length of every tweet, compared between the two classes.
train['length'] = train['text'].apply(len)
data = [
    go.Box(y=train[train['target'] == label]['length'], name=label_name)
    for label, label_name in ((0, 'Fake'), (1, 'Real'))
]
layout = go.Layout(title = 'Comparison of text length in Tweets')
fig = go.Figure(data=data, layout=layout)
fig.show()

In [None]:
# Summary statistics of the numeric columns of the training frame.
train.describe()

In [None]:
# Keep only the tweet text and the target; drop the other columns.
train = train.drop(columns=['id', 'keyword', 'location', 'length'])

In [None]:
# Separate the label column from the feature frame.
train_target = train['target']
train_feature = train.drop(columns='target')

In [None]:
# Reduce the test frame to the tweet text as well.
test_feature = test.drop(columns=['id', 'keyword', 'location'])

In [None]:
# Count missing values in the features and in the target.
missing_in_features = train_feature.isna().sum()
missing_in_target = train_target.isna().sum()
display(missing_in_features)
display(missing_in_target)

In [None]:
# Shape and dtypes of the feature frame, then of the target series.
for frame in (train_feature, train_target):
    display(frame.shape, frame.dtypes)

<h1 style="background-color:DodgerBlue;text-align:center;color:white;">Text Preprocessing</h1>

In [None]:
def clean_text(text):
    '''Normalize one tweet before tokenization.

    Lowercases the text, then removes (in this order) text in square brackets,
    links, HTML-style tags, punctuation and words containing digits. Newlines
    are turned into spaces so that words on adjacent lines are not fused.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned text. Spaces left behind by the removals are kept.
    '''
    text = text.lower()
    # Raw strings keep regex escapes such as \[ and \S from being read as
    # (invalid) Python string escapes.
    text = re.sub(r'\[.*?\]', '', text)  # text in square brackets
    text = re.sub(r'https?://\S+|www\.\S+', '', text)  # links
    text = re.sub(r'<.*?>+', '', text)  # HTML-style tags
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)  # punctuation
    # Replace newlines with a space; deleting them outright glued the last word
    # of one line to the first word of the next.
    text = text.replace('\n', ' ')
    text = re.sub(r'\w*\d\w*', '', text)  # words containing digits
    return text

# Run the cleaning function over the tweet text of both the training and test frames.
train_feature['text'] = train_feature['text'].apply(clean_text)
test_feature['text'] = test_feature['text'].apply(clean_text)

# Inspect the first cleaned training tweets.
train_feature['text'].head()

In [None]:
# Split every cleaned tweet (training and test) into a list of word tokens.
tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
train_feature['text'] = train_feature['text'].apply(tokenizer.tokenize)
test_feature['text'] = test_feature['text'].apply(tokenizer.tokenize)
train_feature['text'].head()

In [None]:
from functools import lru_cache


@lru_cache(maxsize=None)
def _english_stopwords():
    """Load NLTK's English stopword list once and cache it as a frozenset."""
    return frozenset(stopwords.words('english'))


def remove_stopwords(text, stop_words=None):
    """Drop stopwords from a list of tokens.

    Parameters
    ----------
    text : list of str
        Tokens of one document.
    stop_words : iterable of str, optional
        Words to remove. Defaults to NLTK's English stopword list.

    Returns
    -------
    list of str
        The tokens of `text` that are not stopwords, in their original order.
    """
    if stop_words is None:
        # The original re-read the NLTK corpus and scanned it as a list for
        # every single token; load it once and test membership against a set.
        stop_words = _english_stopwords()
    elif not isinstance(stop_words, (set, frozenset)):
        stop_words = frozenset(stop_words)
    return [w for w in text if w not in stop_words]
# Strip stopwords from the token lists of both frames.
train_feature['text'] = train_feature['text'].apply(remove_stopwords)
test_feature['text'] = test_feature['text'].apply(remove_stopwords)
train_feature.head()

In [None]:
# After preprocessing, the text format
def combine_text(list_of_text):
    '''Join a list of strings into one string, separated by single spaces.'''
    return ' '.join(list_of_text)

# Turn the token lists of both frames back into plain strings.
train_feature['text'] = train_feature['text'].apply(combine_text)
test_feature['text'] = test_feature['text'].apply(combine_text)
train_feature.head()

<h1 style="background-color:DodgerBlue;text-align:center;color:white;">Uni-gram,Bi-gram,Tri-Gram for Train Set</h1>

In [None]:
import cufflinks as cf
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
# Set up Plotly for in-notebook rendering and put cufflinks in offline mode;
# the `.iplot(...)` calls below rely on this.
init_notebook_mode(connected=True)
cf.go_offline()

In [None]:
def get_top_n_words(corpus, n=None):
    """Return the `n` most frequent unigrams in `corpus`.

    Parameters
    ----------
    corpus : iterable of str
        Documents to count tokens in.
    n : int or None
        Number of (word, count) pairs to return; None returns all of them.

    Returns
    -------
    list of (str, count) tuples
        Sorted by descending count.
    """
    vec = CountVectorizer()
    # fit_transform learns the vocabulary and builds the counts in one pass
    # instead of tokenizing the corpus twice (fit, then transform).
    bag_of_words = vec.fit_transform(corpus)
    sum_words = bag_of_words.sum(axis=0)  # per-term totals, indexed as [0, idx]
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:n]

# Bar chart of the 30 most frequent unigrams in the cleaned training tweets.
common_words = get_top_n_words(train_feature['text'], 30)
df2 = pd.DataFrame(common_words, columns=['word', 'count'])
unigram_totals = df2.groupby('word').sum()['count'].sort_values(ascending=False)
unigram_totals.iplot(
    kind='bar',
    yTitle='Count',
    linecolor='black',
    title='Top 30 unigrams used in Tweets',
    color='#48cae4',
)

In [None]:
def get_top_n_bigram(corpus, n=None):
    """Return the `n` most frequent bigrams in `corpus`.

    Parameters
    ----------
    corpus : iterable of str
        Documents to count bigrams in.
    n : int or None
        Number of (bigram, count) pairs to return; None returns all of them.

    Returns
    -------
    list of (str, count) tuples
        Sorted by descending count.
    """
    vec = CountVectorizer(ngram_range=(2, 2))
    # fit_transform learns the vocabulary and builds the counts in one pass
    # instead of tokenizing the corpus twice (fit, then transform).
    bag_of_words = vec.fit_transform(corpus)
    sum_words = bag_of_words.sum(axis=0)  # per-term totals, indexed as [0, idx]
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:n]

# Bar chart of the 30 most frequent bigrams in the cleaned training tweets.
common_words = get_top_n_bigram(train_feature['text'], 30)
df3 = pd.DataFrame(common_words, columns=['words', 'count'])
bigram_totals = df3.groupby('words').sum()['count'].sort_values(ascending=False)
bigram_totals.iplot(
    kind='bar',
    yTitle='Count',
    linecolor='black',
    title='Top 30 bigrams used in Tweets',
    color='#720026',
)

In [None]:
def get_top_n_trigram(corpus, n=None):
    """Return the `n` most frequent trigrams in `corpus`.

    Parameters
    ----------
    corpus : iterable of str
        Documents to count trigrams in.
    n : int or None
        Number of (trigram, count) pairs to return; None returns all of them.

    Returns
    -------
    list of (str, count) tuples
        Sorted by descending count.
    """
    vec = CountVectorizer(ngram_range=(3, 3))
    # fit_transform learns the vocabulary and builds the counts in one pass
    # instead of tokenizing the corpus twice (fit, then transform).
    bag_of_words = vec.fit_transform(corpus)
    sum_words = bag_of_words.sum(axis=0)  # per-term totals, indexed as [0, idx]
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:n]

# Bar chart of the 30 most frequent trigrams in the cleaned training tweets.
common_words = get_top_n_trigram(train_feature['text'], 30)
df3 = pd.DataFrame(common_words, columns = ['words' ,'count'])
# The title previously said "bigrams" although this chart shows trigrams.
df3.groupby('words').sum()['count'].sort_values(ascending=False).iplot(
    kind='bar', yTitle='Count', linecolor='black', title='Top 30 trigrams used in Tweets', color='#e9b827')

<h1 style="background-color:DodgerBlue;text-align:center;color:white;">Uni-gram,Bi-gram,Tri-Gram for Test Set</h1>

In [None]:
# NOTE: this re-defines (and shadows) the `get_top_n_words` already defined
# earlier in this notebook; the two definitions are meant to behave identically.
def get_top_n_words(corpus, n=None):
    """Return the `n` most frequent unigrams in `corpus`.

    Parameters
    ----------
    corpus : iterable of str
        Documents to count tokens in.
    n : int or None
        Number of (word, count) pairs to return; None returns all of them.

    Returns
    -------
    list of (str, count) tuples
        Sorted by descending count.
    """
    vec = CountVectorizer()
    # fit_transform learns the vocabulary and builds the counts in one pass
    # instead of tokenizing the corpus twice (fit, then transform).
    bag_of_words = vec.fit_transform(corpus)
    sum_words = bag_of_words.sum(axis=0)  # per-term totals, indexed as [0, idx]
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:n]

# Bar chart of the 30 most frequent unigrams in the cleaned test tweets.
common_words = get_top_n_words(test_feature['text'], 30)
df2 = pd.DataFrame(common_words, columns=['word', 'count'])
unigram_totals = df2.groupby('word').sum()['count'].sort_values(ascending=False)
unigram_totals.iplot(
    kind='bar',
    yTitle='Count',
    linecolor='black',
    title='Top 30 unigrams used in Tweets',
    color='#48cae4',
)

In [None]:
# NOTE: this re-defines (and shadows) the `get_top_n_bigram` already defined
# earlier in this notebook; the two definitions are meant to behave identically.
def get_top_n_bigram(corpus, n=None):
    """Return the `n` most frequent bigrams in `corpus`.

    Parameters
    ----------
    corpus : iterable of str
        Documents to count bigrams in.
    n : int or None
        Number of (bigram, count) pairs to return; None returns all of them.

    Returns
    -------
    list of (str, count) tuples
        Sorted by descending count.
    """
    vec = CountVectorizer(ngram_range=(2, 2))
    # fit_transform learns the vocabulary and builds the counts in one pass
    # instead of tokenizing the corpus twice (fit, then transform).
    bag_of_words = vec.fit_transform(corpus)
    sum_words = bag_of_words.sum(axis=0)  # per-term totals, indexed as [0, idx]
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:n]

# Bar chart of the 30 most frequent bigrams in the cleaned test tweets.
common_words = get_top_n_bigram(test_feature['text'], 30)
df3 = pd.DataFrame(common_words, columns=['words', 'count'])
bigram_totals = df3.groupby('words').sum()['count'].sort_values(ascending=False)
bigram_totals.iplot(
    kind='bar',
    yTitle='Count',
    linecolor='black',
    title='Top 30 bigrams used in Tweets',
    color='#720026',
)

In [None]:
# NOTE: this re-defines (and shadows) the `get_top_n_trigram` already defined
# earlier in this notebook; the two definitions are meant to behave identically.
def get_top_n_trigram(corpus, n=None):
    """Return the `n` most frequent trigrams in `corpus`.

    Parameters
    ----------
    corpus : iterable of str
        Documents to count trigrams in.
    n : int or None
        Number of (trigram, count) pairs to return; None returns all of them.

    Returns
    -------
    list of (str, count) tuples
        Sorted by descending count.
    """
    vec = CountVectorizer(ngram_range=(3, 3))
    # fit_transform learns the vocabulary and builds the counts in one pass
    # instead of tokenizing the corpus twice (fit, then transform).
    bag_of_words = vec.fit_transform(corpus)
    sum_words = bag_of_words.sum(axis=0)  # per-term totals, indexed as [0, idx]
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:n]

# Bar chart of the 30 most frequent trigrams in the cleaned test tweets.
common_words = get_top_n_trigram(test_feature['text'], 30)
df3 = pd.DataFrame(common_words, columns = ['words' ,'count'])
# The title previously said "bigrams" although this chart shows trigrams.
df3.groupby('words').sum()['count'].sort_values(ascending=False).iplot(
    kind='bar', yTitle='Count', linecolor='black', title='Top 30 trigrams used in Tweets', color='#e9b827')

<h1 style="background-color:DodgerBlue;text-align:center;color:white;">WordCloud Train Set</h1>

In [None]:
from wordcloud import WordCloud

# Word cloud built from all cleaned training tweets joined into one string.
train_corpus = ' '.join(train_feature['text'])
wc = WordCloud(background_color="black", max_words=150,
               max_font_size=150, random_state=42)
wc.generate(train_corpus)

plt.figure(figsize=(16,8))
plt.imshow(wc, interpolation="bilinear")
plt.axis('off')
plt.show()

<h1 style="background-color:DodgerBlue;text-align:center;color:white;">WordCloud Test Set</h1>

In [None]:
from wordcloud import WordCloud

# Word cloud built from all cleaned test tweets joined into one string.
test_corpus = ' '.join(test_feature['text'])
wc = WordCloud(background_color="black", max_words=150,
               max_font_size=150, random_state=42)
wc.generate(test_corpus)

plt.figure(figsize=(16,8))
plt.imshow(wc, interpolation="bilinear")
plt.axis('off')
plt.show()

<h1 style="background-color:DodgerBlue;text-align:center;color:white;">Tf-idf & Multinomial Naive-Bayes</h1>

In [None]:
# TF-IDF features of the cleaned training tweets, converted to a dense array.
tf_idf = TfidfVectorizer()
X = tf_idf.fit_transform(train_feature.text).toarray()

In [None]:

# Hold out a third of the labelled tweets for validation.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    train['target'],
    test_size=0.33,
    random_state=42,
)

In [None]:
# Tune the `alpha` of Multinomial Naive Bayes with a 10-fold grid search.
parameters = {'alpha': (1, 0.1, 0.01, 0.001, 0.0001, 0.00001)}
model_nv = MultinomialNB()
clf = GridSearchCV(estimator=model_nv, param_grid=parameters,
                   cv=10, scoring='accuracy')
clf.fit(X_train, y_train)

print("The best Score", clf.best_score_)
print("-------")
print("The best Estimator", clf.best_estimator_)

In [None]:
# Accuracy of the tuned Naive Bayes model on the held-out split.
y_pred = clf.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)
accuracy

In [None]:
# Confusion matrix of the grid-searched Naive Bayes model on (X_val, y_val).
Y_Pred = clf.predict(X_val)
cnf_mat = confusion_matrix(y_val, Y_Pred)
fig, ax = plot_confusion_matrix(
    conf_mat=cnf_mat,
    figsize=(8, 8),
    show_absolute=True,
    show_normed=True,
    colorbar=True,
)
plt.show()

In [None]:
# Per-class precision / recall / F1 for whatever `Y_Pred` currently holds
# (the Naive Bayes predictions when the notebook is run top to bottom).
print(classification_report(y_val,Y_Pred))

<h1 style="background-color:DodgerBlue;text-align:center;color:white;">AdaBoost-GridSearch</h1>

In [None]:
from sklearn.ensemble import  AdaBoostClassifier

# AdaBoost over depth-8 decision trees, tuned with a small 3-fold grid search.
ada_base = AdaBoostClassifier(DecisionTreeClassifier(max_depth=8), random_state=42)

# scikit-learn renamed AdaBoost's `base_estimator` parameter to `estimator`
# and later removed the old name, so a hard-coded "base_estimator__..." key
# fails on newer releases. Use whichever prefix this installation exposes.
tree_prefix = "estimator" if "estimator" in ada_base.get_params() else "base_estimator"
parameters = {
    f"{tree_prefix}__criterion": ["gini", "entropy"],
    f"{tree_prefix}__splitter": ["random"],
    "n_estimators": [100],
    "learning_rate": [0.05, 0.5, 1],
}
ada_clf = GridSearchCV(ada_base, parameters, cv=3, scoring="accuracy")
ada_clf.fit(X_train, y_train)
print(f'Best parameters {ada_clf.best_params_}')
print('-----')
print(f'Mean cross-validated accuracy score of the best_estimator: {ada_clf.best_score_:.3f}')

In [None]:
# Score of the best AdaBoost configuration on (X_val, y_val).
# NOTE(review): the label says "Test", but X_val / y_val appear to be the
# validation split held out from the training data — confirm the wording.
print("Test Accuracy:",ada_clf.score(X_val, y_val))

In [None]:
# Confusion matrix of the grid-searched AdaBoost model on (X_val, y_val).
Y_Pred = ada_clf.predict(X_val)
cnf_mat = confusion_matrix(y_val, Y_Pred)
fig, ax = plot_confusion_matrix(
    conf_mat=cnf_mat,
    figsize=(8, 8),
    show_absolute=True,
    show_normed=True,
    colorbar=True,
)
plt.show()

In [None]:
# Per-class precision / recall / F1 for whatever `Y_Pred` currently holds
# (the AdaBoost predictions when the notebook is run top to bottom).
print(classification_report(y_val,Y_Pred))

<h1 style="background-color:DodgerBlue;text-align:center;color:white;">CatBoost-GridSearch</h1>

In [None]:
## Hyperparameter grid search for CatBoost
import catboost as cb
from catboost import CatBoostClassifier

parameters = {'depth': [4, 7, 10],
              'learning_rate' : [0.03, 0.1, 0.15],
              'l2_leaf_reg': [1,9],
              'iterations': [100]}
# verbose=False silences CatBoost's per-iteration log, which would otherwise
# be printed for every fit of the grid search.
cb_clf = cb.CatBoostClassifier(verbose=False)
cb_clf = GridSearchCV(cb_clf, parameters, scoring="roc_auc", cv = 5)
cb_clf.fit(X_train,y_train)
print(f'Best parameters {cb_clf.best_params_}')
print('-----')
# The search is scored with ROC AUC, so best_score_ is a ROC AUC, not an
# accuracy; the message previously mislabelled it.
print(f'Mean cross-validated ROC AUC of the best_estimator: {cb_clf.best_score_:.3f}')

In [None]:
# GridSearchCV.score() uses the search's `scoring` (ROC AUC here), so this is
# a ROC AUC on the held-out split, not an accuracy as the old label claimed.
print("Validation ROC AUC:", cb_clf.score(X_val, y_val))

In [None]:
# Confusion matrix of the grid-searched CatBoost model on (X_val, y_val).
Y_Pred = cb_clf.predict(X_val)
cnf_mat = confusion_matrix(y_val, Y_Pred)
fig, ax = plot_confusion_matrix(
    conf_mat=cnf_mat,
    figsize=(8, 8),
    show_absolute=True,
    show_normed=True,
    colorbar=True,
)
plt.show()

<h1 style="background-color:DodgerBlue;text-align:center;color:white;">Neural Net</h1>

In [None]:
# Size of the index space passed to `one_hot` and to the Embedding layer below.
# NOTE(review): this is the number of training tweets + 1, not a count of
# distinct words — confirm that is the intended "vocabulary size".
voc_size=len(train_feature['text'])+1 #deciding My Vocabulary Size

In [None]:
# Encode every training tweet as a list of integer word indices in [1, voc_size).
onehot_representation = [one_hot(words, voc_size) for words in train_feature['text']]
# Show only a few encoded tweets; displaying the whole list floods the output.
onehot_representation[:5]

**Embedding Representation**

In [None]:
# Pad every encoded tweet at the end so all sequences share one length.
sent_length = 120
embedding = pad_sequences(
    onehot_representation,
    maxlen=sent_length,
    padding='post',
)
print(embedding)
display(len(embedding))

In [None]:
## Creating model
embedding_vector_features = 200  # size of each word embedding vector
model = Sequential()
# Trainable embedding over the `voc_size` index space, one vector per position.
model.add(Embedding(voc_size, embedding_vector_features, input_length=sent_length, trainable=True))
model.add(LSTM(100))  # one LSTM layer with 100 units
model.add(Dense(1, activation='sigmoid'))  # single sigmoid output for the binary target
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None; wrapping it in print()
# only appended a stray "None".
model.summary()

In [None]:
# Model inputs (padded index sequences) and labels as NumPy arrays.
X_final = np.array(embedding)
y_final = np.array(train['target'])

In [None]:
# Sanity check: shapes of the input array and the label array.
X_final.shape,y_final.shape

In [None]:
from sklearn.model_selection import train_test_split

# 80/20 split of the padded sequences for training the network.
# Note: this rebinds X_train / y_train, which earlier held the TF-IDF split.
X_train, X_test, y_train, y_test = train_test_split(
    X_final,
    y_final,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Train for 20 epochs, evaluating on the held-out 20% after each epoch.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=20,
    batch_size=100,
)

In [None]:
# Training vs. validation loss per epoch, plus the best validation loss reached.
history_df = pd.DataFrame(history.history)
history_df[['loss', 'val_loss']].plot();
best_val_loss = history_df['val_loss'].min()
print("Minimum validation loss: {}".format(best_val_loss))

In [None]:
# Encode the test tweets with the SAME index space (`voc_size`) and padded
# length (`sent_length`) that were used for the training tweets and for the
# Embedding layer. The original recomputed `voc_size` from the test set, so
# the same word was mapped to a different index than the one the model was
# trained on.
test_onehot = [one_hot(words, voc_size) for words in test_feature['text']]
test_embedding = pad_sequences(test_onehot, padding='post', maxlen=sent_length)
print(test_embedding)
display(len(test_embedding))

In [None]:
# Test inputs as a NumPy array, mirroring how X_final was built.
test_final = np.array(test_embedding)

In [None]:
# `Sequential.predict_classes` was removed from tf.keras. For a single sigmoid
# output the equivalent is thresholding the predicted probability at 0.5.
# ravel() yields a flat 0/1 vector with one entry per test tweet.
y_pred = (model.predict(test_final) > 0.5).astype("int32").ravel()

In [None]:
# Fill the sample submission with the network's predictions and save it.
submission = pd.read_csv("/kaggle/input/nlp-getting-started/sample_submission.csv")
submission['target'] = y_pred
# index=False keeps the file to the sample's own columns; without it pandas
# writes the row index as an extra unnamed first column.
submission.to_csv('new_submission.csv', index=False)
submission.head(5)