In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
input_file_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import f1_score

In [None]:
# Load the three competition CSVs: the training frame, the test frame and the
# submission template that is filled in at the end of the notebook.
train_df = pd.read_csv("../input/nlp-getting-started/train.csv")
test_df = pd.read_csv("../input/nlp-getting-started/test.csv")
sample_submission = pd.read_csv("../input/nlp-getting-started/sample_submission.csv")

# Quick EDA

In [None]:
# Preview the first rows of the training frame.
train_df.head()

In [None]:
# Print the column names, dtypes and non-null counts of the training frame.
train_df.info()

In [None]:
# Show the distinct values present in the 'keyword' column.
train_df['keyword'].unique()

In [None]:
# Show the distinct values present in the 'location' column.
train_df['location'].unique()

In [None]:
# Class balance: count rows per target value once, then draw the shares as a pie.
rows_per_target = train_df.groupby(['target']).count()['id']
data = [rows_per_target[0], rows_per_target[1]]
labels = ['not-disaster', 'disaster']
colors = sns.color_palette('pastel')
plt.pie(data, labels=labels, colors=colors, autopct='%0.0f%%')
plt.show()

In [None]:
palette = sns.color_palette('magma')  # assigned here but not used by the chart below

# Top 20 keywords, by row count, among rows where target == 1.
disaster_rows = train_df[train_df['target'] == 1]
keyword_counts = disaster_rows.groupby(['keyword']).count().id
data = pd.DataFrame(keyword_counts.sort_values(ascending=False)[:20]).reset_index()

# Wide figure with a single axes filling the whole canvas.
fig = plt.figure(figsize=(30, 6))
ax = fig.add_axes([0, 0, 1, 1])
ax.bar(data.keyword, data.id)
ax.set_ylabel('Count')
ax.set_xlabel('A particular keywords from the disaster tweet (may be blank)')

In [None]:
# Top 20 keywords, by row count, among rows where target == 0.
non_disaster_rows = train_df[train_df['target'] == 0]
keyword_counts = non_disaster_rows.groupby(['keyword']).count().id
data = pd.DataFrame(keyword_counts.sort_values(ascending=False)[:20]).reset_index()

# Wide figure with a single axes filling the whole canvas.
fig = plt.figure(figsize=(30, 6))
ax = fig.add_axes([0, 0, 1, 1])
ax.bar(data.keyword, data.id)
ax.set_ylabel('Count')
ax.set_xlabel('A particular keywords from the not-disaster tweet (may be blank)')

In [None]:
# Top 20 locations, by row count, across the whole training frame.
location_counts = train_df.groupby('location').count().id
data = pd.DataFrame(location_counts.sort_values(ascending=False)[:20]).reset_index()

# Wide figure with a single axes filling the whole canvas.
fig = plt.figure(figsize=(30, 6))
ax = fig.add_axes([0, 0, 1, 1])
ax.bar(data.location, data.id)
ax.set_ylabel('Count')
ax.set_xlabel('The location the tweet was sent from (may also be blank)')

In [None]:
# One word cloud per target class (1 first, then 0), each with its own colormap.
class_colormaps = [(1, 'magma'), (0, 'winter')]
for label, cmap in class_colormaps:
    # Join every text of the current class into one space-separated string.
    text = train_df.query('target == @label')['text'].str.cat(sep=' ')

    wc = WordCloud(width=1000, height=600, background_color="#f8f8f8", colormap=cmap)
    wc.generate_from_text(text)

    plt.figure(figsize=(10, 6))
    plt.imshow(wc)
    plt.axis("off")
    plt.title(f"Words Commonly Used in which target is ${label}$", size=20)
    plt.show()

# Data Pre-Processing

In [None]:
# '%20' is the URL encoding of a space; turn each occurrence back into a
# literal space throughout the training frame (modified in place).
percent_encoded_space = {'%20': ' '}
train_df.replace(regex=percent_encoded_space, inplace=True)

**Replacing chat abbreviations with their full-text expansions**

In [None]:
# Chat/SMS abbreviations and the plain-English text they are expanded to.
#
# Every key is used as a regex by DataFrame.replace. Apart from 'AFAIK', the
# keys are wrapped in single spaces, so an abbreviation is only rewritten when
# it has a space on both sides; the replacement restores those spaces so the
# neighbouring words stay separated.
#
# Fixes relative to the previous version of this table:
#   * ' U2 ' now expands to " You Too " (the trailing space was missing, which
#     glued the expansion onto the following word).
#   * the duplicated ' B4N ' entry is listed only once (same value, so no
#     change in behaviour).
#
# NOTE(review): a few expansions look truncated (e.g. ' IMHO ', ' LMAO ',
# ' ROTFLMAO '). They are left untouched here so this table keeps producing
# the same text as the copy applied to the test frame.
CHAT_ABBREVIATIONS = {
    'AFAIK': 'As Far As I Know',
    ' AFK ': ' Away From Keyboard ',
    ' ASAP ': ' As Soon As Possible ',
    ' ATK ': ' At The Keyboard ',
    ' ATM ': ' At The Moment ',
    ' A3 ': ' Anytime, Anywhere, Anyplace ',
    ' BAK ': ' Back At Keyboard ',
    ' BBL ': ' Be Back Later ',
    ' BBS ': ' Be Back Soon ',
    ' BFN ': ' Bye For Now ',
    ' B4N ': ' Bye For Now ',
    ' BRB ': ' Be Right Back ',
    ' BRT ': ' Be Right There ',
    ' BTW ': ' By The Way ',
    ' B4 ': ' Before ',
    ' CU ': ' See You ',
    ' CUL8R ': ' See You Later ',
    ' CYA ': ' See You ',
    ' FAQ ': ' Frequently Asked Questions ',
    ' FC ': ' Fingers Crossed ',
    ' FWIW ': " For What It's Worth ",
    ' FYI ': ' For Your Information ',
    ' GAL ': ' Get A Life ',
    ' GG ': ' Good Game ',
    ' GN ': ' Good Night ',
    ' GMTA ': ' Great Minds Think Alike ',
    ' GR8 ': " Great! ",
    ' G9 ': " Genius ",
    ' IC ': " I See ",
    ' ICQ ': " I Seek you ",
    ' ILU ': " I Love You ",
    ' IMHO ': " In My Honest ",
    ' IMO ': " In My Opinion ",
    ' IOW ': " In Other Words ",
    ' IRL ': " In Real Life ",
    ' KISS ': " Keep It Simple, Stupid ",
    ' LDR ': " Long Distance Relationship ",
    ' LMAO ': " Laugh My Ass ",
    ' LOL ': " Laughing Out Loud ",
    ' LTNS ': " Long Time No See ",
    ' L8R ': " Later ",
    ' MTE ': " My Thoughts Exactly ",
    ' M8 ': " Mate ",
    ' NRN ': " No Reply Necessary ",
    ' OIC ': " Oh I See ",
    ' PITA ': " Pain In The Ass ",
    ' PRT ': " Party ",
    ' PRW ': " Parents Are Watching ",
    ' ROFL ': " Rolling On The Floor Laughing ",
    ' ROFLOL ': " Rolling On The Floor Laughing Out Loud ",
    ' ROTFLMAO ': " Rolling On The Floor Laughing My Ass ",
    ' SK8 ': " Skate ",
    ' STATS ': " Your sex and age ",
    ' ASL ': " Age, Sex, Location ",
    ' THX ': " Thank You ",
    ' TTFN ': " Ta-Ta For Now! ",
    ' TTYL ': " Talk To You Later ",
    ' U ': " You ",
    ' U2 ': " You Too ",
    ' U4E ': " Yours For Ever ",
    ' WB ': " Welcome Back ",
    ' WTF ': " What The Fuck ",
    ' WTG ': " Way To Go! ",
    ' WUF ': " Where Are You From? ",
    ' W8 ': " Wait... ",
}

# Expand the abbreviations throughout the training frame (modified in place).
train_df.replace(regex=CHAT_ABBREVIATIONS, inplace=True)

In [None]:
# Regex clean-up of the training frame (modified in place). Each match is
# replaced by a single space. The patterns are passed to pandas in exactly
# this order, which is kept unchanged on purpose.
noise_patterns = {
    r'https?://\S+': ' ',  # http:// and https:// URLs
    r'<.*?>': ' ',         # anything between '<' and the next '>'
    r'\d+': ' ',           # runs of digits
    r'#\w+': ' ',          # '#' together with the word characters after it
    '[^a-zA-Z]': ' ',      # every character that is not an ASCII letter
    r'http\S+': ' ',       # 'http' followed by any non-space characters
}
train_df.replace(regex=noise_patterns, inplace=True)

**Removing stop words**

In [None]:
# English stop-word list; `sw` is kept as a list because later cells reuse it.
sw = stopwords.words('english')

# Set membership is O(1) per word, instead of scanning the whole list each time.
stopword_lookup = set(sw)

# Drop stop words from every training text and re-join the rest with single
# spaces. (A bare `train_df['text'][2]` used to sit here; as a non-final
# expression it displayed nothing, so it was removed -- the next cell shows
# the cleaned example.)
# NOTE(review): the comparison is case-sensitive, so a capitalised word is
# only removed if the list contains that exact spelling -- confirm whether
# that is intended. It is left as is to match the filter used on test_df.
train_df['text'] = train_df['text'].apply(
    lambda x: ' '.join(word for word in x.split() if word not in stopword_lookup)
)

In [None]:
# Display the text of the row labelled 2 after cleaning.
train_df.loc[2, 'text']

In [None]:
# TF-IDF features over unigrams and bigrams, fitted on the cleaned training text.
vectorizer = TfidfVectorizer(ngram_range=(1, 2))
X = vectorizer.fit_transform(train_df['text'])

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. Fall back to the old method only
# on releases that predate the new one.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# NOTE: despite its name, `target` is the dense feature matrix (one column per
# n-gram), not the label vector. The name is kept because later cells use it.
target = pd.DataFrame(X.toarray(), columns=feature_names)

In [None]:
# Label vector: the 'target' column of the training frame.
y=train_df['target']

In [None]:
# Hold out 10% of the rows for validation. The split is shuffled with a fixed
# seed and stratified on y.
split_options = dict(
    test_size=0.1,
    shuffle=True,
    stratify=y,
    random_state=42,
)
x_train, x_val, y_train, y_val = train_test_split(target, y, **split_options)

In [None]:
# Report the (rows, columns) shape of the training and validation feature sets,
# one per line.
print(x_train.shape, x_val.shape, sep='\n')

# Model Training

In [None]:
# Multi-layer perceptron with a fixed seed, early stopping enabled and verbose
# logging switched on.
mlp_options = dict(random_state=0, early_stopping=True, verbose=2)
mlp = MLPClassifier(**mlp_options)
mlp.fit(x_train, y_train)

In [None]:
# Score the MLP on the validation split: accuracy, per-class report and a
# confusion-matrix heatmap.
y_pred_mlp = mlp.predict(x_val)
cm_mlp = confusion_matrix(y_val, y_pred_mlp)

print('MLP Accuracy:', accuracy_score(y_val, y_pred_mlp))
print(classification_report(y_val, y_pred_mlp))

# heatmap() returns the axes it drew on; label that axes directly.
heatmap_ax = sns.heatmap(cm_mlp, annot=True, fmt='g', cbar=False)
heatmap_ax.set_xlabel('Predicted Values')
heatmap_ax.set_ylabel('Actual Values')
heatmap_ax.set_title('MLP Confusion Matrix')
plt.show()

In [None]:
# Logistic regression (liblinear solver, fixed seed) on min-max scaled
# features. The two steps are chained in a Pipeline so the scaler fitted on
# the training data is reused at prediction time.
scaler = MinMaxScaler()
lr = LogisticRegression(random_state=777, solver='liblinear')
pipeline = Pipeline(steps=[('scale', scaler), ('lr', lr)])

# fit() returns the pipeline itself, so fitting and predicting can be chained.
y_pred_lr = pipeline.fit(x_train, y_train).predict(x_val)

In [None]:
# F1 score of the logistic-regression pipeline on the data it was trained on.
train_predictions = pipeline.predict(x_train)
train_f1 = f1_score(y_train, train_predictions)
print('Training f-1 score: %.4f' % train_f1)

In [None]:
# Score the logistic-regression pipeline on the validation split: accuracy,
# per-class report and a confusion-matrix heatmap.
cm_lr = confusion_matrix(y_val, y_pred_lr)

print('lr Accuracy:', accuracy_score(y_val, y_pred_lr))
print(classification_report(y_val, y_pred_lr))

# heatmap() returns the axes it drew on; label that axes directly.
heatmap_ax = sns.heatmap(cm_lr, annot=True, fmt='g', cbar=False)
heatmap_ax.set_xlabel('Predicted Values')
heatmap_ax.set_ylabel('Actual Values')
heatmap_ax.set_title('lr Confusion Matrix')
plt.show()

# Submission

In [None]:
# Apply to the test frame the same text normalisation used on the training
# frame, modifying test_df in place.

# '%20' is the URL encoding of a space; turn it back into a literal space.
test_df.replace(regex={'%20': ' '}, inplace=True)

# Expand chat/SMS abbreviations. Every key is used as a regex; apart from
# 'AFAIK', the keys are wrapped in single spaces, so an abbreviation is only
# rewritten when it has a space on both sides.
#
# Fixes relative to the previous version of this table:
#   * ' U2 ' now expands to " You Too " (the trailing space was missing, which
#     glued the expansion onto the following word).
#   * the duplicated ' B4N ' entry is listed only once (same value, so no
#     change in behaviour).
#
# NOTE(review): a few expansions look truncated (e.g. ' IMHO ', ' LMAO ',
# ' ROTFLMAO '). They are left untouched here so this table keeps producing
# the same text as the copy applied to the training frame.
test_df.replace(regex={
    'AFAIK': 'As Far As I Know',
    ' AFK ': ' Away From Keyboard ',
    ' ASAP ': ' As Soon As Possible ',
    ' ATK ': ' At The Keyboard ',
    ' ATM ': ' At The Moment ',
    ' A3 ': ' Anytime, Anywhere, Anyplace ',
    ' BAK ': ' Back At Keyboard ',
    ' BBL ': ' Be Back Later ',
    ' BBS ': ' Be Back Soon ',
    ' BFN ': ' Bye For Now ',
    ' B4N ': ' Bye For Now ',
    ' BRB ': ' Be Right Back ',
    ' BRT ': ' Be Right There ',
    ' BTW ': ' By The Way ',
    ' B4 ': ' Before ',
    ' CU ': ' See You ',
    ' CUL8R ': ' See You Later ',
    ' CYA ': ' See You ',
    ' FAQ ': ' Frequently Asked Questions ',
    ' FC ': ' Fingers Crossed ',
    ' FWIW ': " For What It's Worth ",
    ' FYI ': ' For Your Information ',
    ' GAL ': ' Get A Life ',
    ' GG ': ' Good Game ',
    ' GN ': ' Good Night ',
    ' GMTA ': ' Great Minds Think Alike ',
    ' GR8 ': " Great! ",
    ' G9 ': " Genius ",
    ' IC ': " I See ",
    ' ICQ ': " I Seek you ",
    ' ILU ': " I Love You ",
    ' IMHO ': " In My Honest ",
    ' IMO ': " In My Opinion ",
    ' IOW ': " In Other Words ",
    ' IRL ': " In Real Life ",
    ' KISS ': " Keep It Simple, Stupid ",
    ' LDR ': " Long Distance Relationship ",
    ' LMAO ': " Laugh My Ass ",
    ' LOL ': " Laughing Out Loud ",
    ' LTNS ': " Long Time No See ",
    ' L8R ': " Later ",
    ' MTE ': " My Thoughts Exactly ",
    ' M8 ': " Mate ",
    ' NRN ': " No Reply Necessary ",
    ' OIC ': " Oh I See ",
    ' PITA ': " Pain In The Ass ",
    ' PRT ': " Party ",
    ' PRW ': " Parents Are Watching ",
    ' ROFL ': " Rolling On The Floor Laughing ",
    ' ROFLOL ': " Rolling On The Floor Laughing Out Loud ",
    ' ROTFLMAO ': " Rolling On The Floor Laughing My Ass ",
    ' SK8 ': " Skate ",
    ' STATS ': " Your sex and age ",
    ' ASL ': " Age, Sex, Location ",
    ' THX ': " Thank You ",
    ' TTFN ': " Ta-Ta For Now! ",
    ' TTYL ': " Talk To You Later ",
    ' U ': " You ",
    ' U2 ': " You Too ",
    ' U4E ': " Yours For Ever ",
    ' WB ': " Welcome Back ",
    ' WTF ': " What The Fuck ",
    ' WTG ': " Way To Go! ",
    ' WUF ': " Where Are You From? ",
    ' W8 ': " Wait... ",
}, inplace=True)

In [None]:
# Regex clean-up of the test frame (modified in place); each match becomes a
# single space. Pattern order is kept exactly as before.
test_df.replace(
    regex={
        r'https?://\S+': ' ',  # http:// and https:// URLs
        r'<.*?>': ' ',         # anything between '<' and the next '>'
        r'\d+': ' ',           # runs of digits
        r'#\w+': ' ',          # '#' together with the word characters after it
        '[^a-zA-Z]': ' ',      # every character that is not an ASCII letter
        r'http\S+': ' ',       # 'http' followed by any non-space characters
    },
    inplace=True,
)

# Drop stop words (`sw` is defined in the training pre-processing cell). A set
# gives O(1) membership tests instead of scanning the list for every word.
stopword_lookup = set(sw)
test_df['text'] = test_df['text'].apply(
    lambda x: ' '.join(word for word in x.split() if word not in stopword_lookup)
)

# Reuse the vectorizer fitted on the training text: transform only, no refit.
X = vectorizer.transform(test_df['text'])

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. Fall back to the old method only
# on releases that predate the new one.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# Dense frame whose columns are the vectorizer's feature names.
X = pd.DataFrame(X.toarray(), columns=feature_names)

In [None]:
# Predict the test-set labels with the fitted MLP.
y_sub_pred_mlp = mlp.predict(X)

# Use bracket assignment rather than attribute assignment: `df.target = ...`
# only updates a column if one with that name already exists; otherwise it
# silently sets a plain Python attribute and the CSV would lack predictions.
# NOTE(review): predictions are assigned by position, which assumes
# sample_submission lists its rows in the same order as test_df -- confirm.
sample_submission['target'] = y_sub_pred_mlp

# Write the submission file without the DataFrame index column.
sample_submission.to_csv('submission.csv', index=False)