In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import plotly
import sklearn
import tensorflow

In [None]:
df_train = pd.read_csv('../input/tweet-sentiment-extraction/train.csv')
df_test  = pd.read_csv('../input/tweet-sentiment-extraction/test.csv')

In [None]:
df_train.head()

In [None]:
df_test.head()


In [None]:
print(df_train.shape)
df_test.shape

In [None]:
import cufflinks as cf
from plotly.offline import download_plotlyjs,init_notebook_mode,plot,iplot
init_notebook_mode(connected=True)
cf.go_offline()

In [None]:
import nltk


In [None]:
# Fetch the stopword corpus without prompting. `nltk.download_shell()` starts an
# interactive downloader that waits for keyboard input, which stalls a
# "Restart & Run All" execution of this notebook.
nltk.download('stopwords', quiet=True)

# Stopwords is already installed.

In [None]:
from nltk.corpus import stopwords

In [None]:
print(df_train.columns)
print(df_test.columns)

In [None]:
print(df_train.info())
df_test.info()

In [None]:

print(df_train.isna().sum())

# only 2 null values out of 24000 total values.
# let's drop them.


df_test.isna().sum()

# No null values in test dataset.

In [None]:
df_train.dropna(inplace=True)


In [None]:
df_train.isna().sum()

# No null values left.

# Exploratory Data Analysis

In [None]:
# Add a `text_length` column (number of characters in each tweet) to both frames.
df_train['text_length'] = df_train['text'].map(len)

df_test['text_length'] = df_test['text'].map(len)

In [None]:

# White-grid theme, then a single 10x5 figure with the distribution of the
# training tweets' character counts drawn on its axes.
sns.set_style('whitegrid')
fig, ax = plt.subplots(figsize=(10, 5))
sns.distplot(df_train['text_length'], color='green', ax=ax)

# normal distributed data

In [None]:
g = sns.FacetGrid(data=df_train,col='sentiment',height=4)
g.map(sns.distplot,'text_length')

In [None]:
df_train['sentiment'].value_counts().iplot(kind='bar',color='black')

# Maximum Neutral texts

In [None]:
df_test['sentiment'].value_counts().iplot(kind='bar',color='purple')

In [None]:
import string


In [None]:
print(df_train['text'][4])
df_train['selected_text'][4].split()

In [None]:

def sel_tex(i):
    """Split the string `i` on runs of whitespace and return the list of tokens."""
    return i.split()

In [None]:
df_train['selected_text2'] = df_train['selected_text'].apply(sel_tex)


In [None]:
df_train.head()

# Feature Engineering

# OPTION 1

### Using selected_text column of the Train Dataset for predictions.


In [None]:
# The selected_text column of the test dataset will be built on the basis of the
#    selected_text of the Train dataset, to predict better for types of messages.
#
# Vocabulary for OPTION 1: every whitespace-separated token that occurs anywhere
# in the training set's `selected_text` column.

select_text = pd.Series(df_train['selected_text'])

# One long string holding all selected texts, separated by single spaces.
list1 = ' '.join(select_text)

# Kept under the name `list2` for the cells that follow, but stored as a set:
# it is only used for `w in list2` membership checks, which a list answers by
# scanning every token while a set answers with a single hash lookup.
list2 = set(list1.split())

In [None]:
def test_select(i):
    l  = [ ]
    for w in i.split():
        if w in list2:
            l.append(w)
    return(l)

In [None]:
df_test['selected_text'] = df_test['text'].apply(test_select)


In [None]:
df_test.head(6)


In [None]:
df_train.head(1)


In [None]:
from sklearn.feature_extraction.text import CountVectorizer


# Fitting and Training the Model

In [None]:
bag_of_words = CountVectorizer(analyzer=test_select).fit(df_test['text'])

In [None]:
df_test_bow_trans = bag_of_words.transform(df_test['text'])

In [None]:
df_test_bow_trans


In [None]:
from sklearn.feature_extraction.text import TfidfTransformer

In [None]:
tfidf = TfidfTransformer().fit(df_test_bow_trans)

In [None]:
df_test_tfidf = tfidf.transform(df_test_bow_trans)

In [None]:
df_test_tfidf.shape

In [None]:
from sklearn.naive_bayes import MultinomialNB

In [None]:
sentiment_detect_model = MultinomialNB().fit(df_test_tfidf,df_test['sentiment'])

In [None]:
all_sentiments_predictions = sentiment_detect_model.predict(df_test_tfidf)

In [None]:
from sklearn.metrics import confusion_matrix,classification_report


In [None]:
# scikit-learn expects (y_true, y_pred): rows are the true sentiments and
# columns are the predicted ones.
print(confusion_matrix(df_test['sentiment'],all_sentiments_predictions))


In [None]:
# scikit-learn expects (y_true, y_pred); with the arguments reversed the
# precision and recall columns of the report are swapped.
print(classification_report(df_test['sentiment'],all_sentiments_predictions))


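# ACCURACY = 81 % (scored on the same test rows the model was fitted on, so this is a training-set figure rather than a held-out estimate)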

# OPTION 2

### Adding a new selected_text column in the Test Dataset on the basis of Test Data text column.

In [None]:
df_test  = pd.read_csv('../input/tweet-sentiment-extraction/test.csv')


In [None]:
df_test.head()


In [None]:
df_test['text_length'] = df_test['text'].apply(lambda x : len(x))


In [None]:
def test_select(i):
    list_text = [text for text in i if text not in string.punctuation]
    join_test_text = ''.join(list_text)
    clean_test_text = [ text for text in join_test_text.split() if text.lower() not in stopwords.words('english')]
    return clean_test_text

In [None]:
df_test['selected_text'] = df_test['text'].apply(test_select)


In [None]:
df_test.head()


In [None]:
# Bag-of-words counts over the test tweets. fit_transform learns the vocabulary
# and builds the count matrix in one pass, so the custom analyzer runs once per
# tweet instead of once for fit() and again for transform().
bag_of_words = CountVectorizer(analyzer=test_select)
df_test_bow_trans = bag_of_words.fit_transform(df_test['text'])


# Re-weight the raw counts with TF-IDF.
tfidf = TfidfTransformer()
df_test_tfidf = tfidf.fit_transform(df_test_bow_trans)


# NOTE(review): the classifier is fitted on df_test and then asked to predict
# those same rows, so the scores printed below are training-set scores.
sentiment_detect_model = MultinomialNB().fit(df_test_tfidf,df_test['sentiment'])


all_sentiments_predictions = sentiment_detect_model.predict(df_test_tfidf)

In [None]:
# scikit-learn expects (y_true, y_pred): rows are the true sentiments and
# columns are the predicted ones.
print(confusion_matrix(df_test['sentiment'],all_sentiments_predictions))


In [None]:
# scikit-learn expects (y_true, y_pred); with the arguments reversed the
# precision and recall columns of the report are swapped.
print(classification_report(df_test['sentiment'],all_sentiments_predictions))


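# ACCURACY = 91 % (scored on the same test rows the model was fitted on, so this is a training-set figure rather than a held-out estimate)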

In [None]:
# Therefore, option 2 has increased accuracy by 10 percentage points.

## Option 1 = 81 %

## Option 2 = 91 %

# Submission

In [None]:
df_test.head(2)

In [None]:
def joined(i):
    """Concatenate the strings in `i` into one string, separated by " , ".

    NOTE(review): the separator places a standalone comma between every pair
    of words in the result - confirm that is what the submission should contain.
    """
    return " , ".join(i)

In [None]:
df_test['selected_text2'] = df_test['selected_text'].apply(joined)

In [None]:
df_test.head()

In [None]:
df_test2 = df_test[['textID','selected_text2']]

In [None]:
# Reassign rather than rename in place: df_test2 was selected out of df_test,
# and mutating such a selection in place can make pandas emit
# SettingWithCopyWarning.
df_test2 = df_test2.rename(columns={'selected_text2':'selected_text'})

In [None]:
df_test2.head(1)

In [None]:
df_test2.to_csv('submission.csv',index=False)