
# **Importing important packages**


In [None]:
import re
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import seaborn as sns
nltk.download('wordnet')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix, classification_report

# **Loading Training Dataset**

In [None]:
# Read the training CSV, print its schema summary, then preview the first rows.
train_csv_path = '../input/tweet-sentiment-extraction/train.csv'
d_train = pd.read_csv(train_csv_path)

d_train.info()
print('\n-------------------------------------------------------------------------------------------------------\n')

# Last expression of the cell, so the notebook renders it.
d_train.head()

According to the dataset information above, there is one null entry in each of the 'text' and 'selected_text' columns, so we have to remove that entire row.

In [None]:
# Boolean mask that is True for every row whose 'text' entry is missing
null_row = d_train['text'].isnull()

# Display only the rows flagged by the mask
d_train.loc[null_row]

The row with index no. 314 has null entries.

In [None]:
# Remove every row whose 'text' entry is null.
# The rows are selected by the null condition itself rather than by a
# hard-coded position (314): a positional drop silently removes the wrong row
# if the CSV ever changes, and deletes an additional, valid row every time the
# cell is re-run. Dropping on the condition is idempotent.
d_train = d_train.dropna(subset=['text'])

# Print the schema summary again to check the non-null counts
d_train.info()

**Hence now we can see that our data has no null entries.**

In [None]:
#storing important columns as tweets and sentiments
train_tweets = d_train['text']
train_sentiments = d_train['sentiment']

#check for imbalance or balance dataset
train_sentiments.value_counts()

**Our dataset is also fairly balanced, so we don't have to do upsampling or downsampling.**

# **Text Preprocessing(TRAINING DATA)**
Text preprocessing is the most important step in text classification problems: it shrinks the corpus by removing words that add little value to the model, and it also simplifies the data.

---
**We'll be doing that in following order**:


1.   Every word of our data will be converted to lower case.
2.   Removing all usernames, for eg. @abc.
3.   Removing all the websites and URL's.
4.   Replacing all the positive emojis by the word 'positive' and negative emoji with word 'negative'.
5.   Removing all non-alphabetic characters (digits, punctuation and other symbols).
6.   Collapsing any character repeated three or more times in a row down to two, e.g. 'reallly' to 'really'.
7.   Removing stopwords.
8.   Removing words shorter than 2 characters.
9.   Lemmatization
10.   Removing all the punctuation marks.


In [None]:
# First cleaning pass over the training tweets. For every tweet:
#   1. strip URLs and @usernames,
#   2. map emoticons to the sentiment tokens ' positive ' / ' negative ',
#   3. lower-case the text,
#   4. replace every non-letter character with a space,
#   5. squeeze runs of 3+ identical characters down to 2.
#
# Fix: emoticons are replaced BEFORE lower-casing. The text used to be
# lower-cased first, so every alternative containing an upper-case letter
# (':D', ':-D', 'xD', 'XD', ';D', ':O') could never match.

# (pattern, replacement token) pairs, applied in this order
emoticon_rules = (
    (r'(<3|:\*)', ' positive '),
    (r'(:\s?D|:-D|x-?D|X-?D)', ' positive '),
    (r'(:\s?\)|:-\)|\(\s?:|\(-:|:\'\)|:O)', ' positive '),
    (r'(;-?\)|;-?D|\(-?;|@-\))', ' positive '),
    (r'(:,\(|:\'\(|:"\()', ' negative '),
    (r'(:\s?\(|:-\(|\)\s?:|\)-:|:-/|:-\|)', ' negative '),
)

semi_final_train_tweets = []

for tweet in train_tweets.values:
    # removing all websites and URL's
    # (case-insensitive because the text is not lower-cased yet)
    tweet = re.sub(r"((http://)[^ ]*|(https://)[^ ]*|( www\.)[^ ]*)", '', tweet,
                   flags=re.IGNORECASE)

    # removing usernames (raw string: '\s' is not a valid str escape)
    # NOTE: this also consumes '@-)', so that emoticon alternative below
    # cannot match after this step.
    tweet = re.sub(r'@[^\s]+', '', tweet)

    # replacing emojis, matched on the original letter case
    for pattern, token in emoticon_rules:
        tweet = re.sub(pattern, token, tweet)

    # converting all the words of tweet into lower case
    tweet = tweet.lower()

    # replacing every non-letter character (digits, punctuation, ...) by a space
    tweet = re.sub("[^a-zA-Z]", " ", tweet)

    # replacing consecutive letters: 3 or more repeats are cut down to 2
    tweet = re.sub(r"(.)\1\1+", r"\1\1", tweet)

    semi_final_train_tweets.append(tweet)
            

**Now comes the part of removing stopwords. The list of stopwords that NLTK provides contains some words that should not be treated as stopwords, because removing them can completely change the meaning of a sentence, e.g. 'not', 'against', etc. So we will first remove such words from the NLTK stopword list and then proceed.**
---



In [None]:
# Words that NLTK lists as English stopwords but that are kept in the tweets,
# so they are taken out of the stopword list (in this order).
words_to_keep = (
    'not', 'against', 'doing',
    'couldn', 'didn', 'doesn', 'hadn', 'hasn', 'haven', 'isn',
    'mightn', 'mustn', 'needn', 'wasn', 'weren', 'won', 'wouldn',
)

stop_w = list(stopwords.words('english'))
for kept_word in words_to_keep:
    # list.remove raises ValueError if the word is not in the NLTK list
    stop_w.remove(kept_word)

# Lemmatizer shared by the preprocessing cells
lematizer = WordNetLemmatizer()

In [None]:
# Second cleaning pass: for each tweet keep only tokens that are at least two
# characters long and not in the stopword list, lemmatize the survivors and
# join them back into one space-separated string.
final_train_tweets = [
    ' '.join(
        lematizer.lemmatize(token)
        for token in cleaned_tweet.split()
        if len(token) >= 2 and token not in stop_w
    )
    for cleaned_tweet in semi_final_train_tweets
]

In [None]:
# Spot-check: display one fully preprocessed training tweet
final_train_tweets[5]

Text preprocessing of the training data is now complete. We have to apply the same process to the test data as well.

# **Loading Test Dataset**

In [None]:
# Read the test CSV and keep the two columns used for evaluation
d_test = pd.read_csv('../input/tweet-sentiment-extraction/test.csv')
test_tweets = d_test['text']
test_sentiments = d_test['sentiment']

d_test.info()
print('\n-------------------------------------------------------------------------------------------------------\n')

# Must stay the LAST expression of the cell: a notebook only renders the value
# of the final expression, so with assignments after it the preview was
# silently discarded.
d_test.head()

Test data has no null values.

# **Text Preprocessing (TEST DATA)**

In [None]:
# Full preprocessing of the test tweets: strip URLs and @usernames, map
# emoticons to sentiment tokens, lower-case, keep letters only, squeeze
# repeated characters, drop stopwords / one-letter tokens and lemmatize.
#
# Fixes:
#   * Emoticons are mapped to ' positive ' / ' negative ', the same tokens the
#     training preprocessing emits. The previous tokens ('smile', 'win',
#     'happy', 'vampire', 'sad') gave the test tweets features that the
#     training tweets never receive from emoticons.
#   * Emoticons are replaced BEFORE lower-casing; otherwise every alternative
#     containing an upper-case letter (':D', 'xD', 'XD', ';D', ':O') can never
#     match.
#   * The username pattern is a raw string ('\s' is not a valid str escape).

# (pattern, replacement token) pairs, applied in this order
emoticon_rules = (
    (r'(<3|:\*)', ' positive '),
    (r'(:\s?D|:-D|x-?D|X-?D)', ' positive '),
    (r'(:\s?\)|:-\)|\(\s?:|\(-:|:\'\)|:O)', ' positive '),
    (r'(;-?\)|;-?D|\(-?;|@-\))', ' positive '),
    (r'(:,\(|:\'\(|:"\()', ' negative '),
    (r'(:\s?\(|:-\(|\)\s?:|\)-:|:-/|:-\|)', ' negative '),
)

final_test_tweets = []

for tweet in test_tweets.values:
    # URLs (case-insensitive because the text is not lower-cased yet)
    tweet = re.sub(r"((http://)[^ ]*|(https://)[^ ]*|( www\.)[^ ]*)", '', tweet,
                   flags=re.IGNORECASE)
    # @usernames
    tweet = re.sub(r'@[^\s]+', '', tweet)
    # emoticons, matched on the original letter case
    for pattern, token in emoticon_rules:
        tweet = re.sub(pattern, token, tweet)

    tweet = tweet.lower()
    # every non-letter character becomes a space
    tweet = re.sub("[^a-zA-Z]", " ", tweet)
    # 3 or more repeats of a character are cut down to 2
    tweet = re.sub(r"(.)\1\1+", r"\1\1", tweet)

    # keep tokens of length >= 2 that are not stopwords, lemmatized
    sentence = [
        lematizer.lemmatize(each_word)
        for each_word in tweet.split()
        if len(each_word) >= 2 and each_word not in stop_w
    ]
    final_test_tweets.append(' '.join(sentence))

# Spot-check one preprocessed test tweet
final_test_tweets[2]

# **TF-IDF Vectorisation of Training and Test Data**

In [None]:
#converting train tweets to vectors
tf_idf_vect = TfidfVectorizer(ngram_range=(1,2))
x_train = tf_idf_vect.fit(final_train_tweets)
x_train = tf_idf_vect.transform(final_train_tweets)


#converting test tweets to vectors
x_test = tf_idf_vect.transform(final_test_tweets)

# **Classification using Multinomial Naive Bayes**

In [None]:
# Train a Multinomial Naive Bayes classifier on the TF-IDF training matrix
# and evaluate it on the test matrix.
classifier = MultinomialNB()
classifier.fit(x_train , train_sentiments)
pred_sentiments = classifier.predict(x_test)

# Arguments follow the (y_true, y_pred) order of the sklearn signature, the
# same order used for classification_report below. They were swapped before;
# accuracy is symmetric, so the printed value is unchanged.
accuracy = accuracy_score(test_sentiments , pred_sentiments)
print('Accuracy of MultinomialNB is ',accuracy)
print('\n')
print(classification_report(test_sentiments,pred_sentiments))

In [None]:
#plotting confusion_matrix of MultinomialNB
print('Confusion Matrix of MultinomialNB is given below:')
cf_matrix = confusion_matrix(test_sentiments,pred_sentiments)
#sns.heatmap(cf_matrix/np.sum(cf_matrix) , figsize=(8,8) , annot=True , fmt='0.2%')
plt.subplots(figsize=(10,10))
sns.set(font_scale=2)
sns.heatmap(cf_matrix/np.sum(cf_matrix), annot=True , cmap='BuPu',linecolor='white' , linewidths=3)
plt.xlabel("Predicted values", fontdict = {'size':22})
plt.ylabel("Actual values"   , fontdict = {'size':22})
plt.title ("Confusion Matrix", fontdict = {'size':22})

# **Classification using Logistic Regression**

In [None]:
#we will use GridSearch to find for perfect hyperparameter
tuned_parameters = [{'C': [ 10**-4,10**-2,10**0,10**2,10**4 ]}]
x = LogisticRegression(penalty='l1', solver="liblinear")
classifier = GridSearchCV(x , tuned_parameters , cv =5)
#, scoring = ['accuracy' , 'f1'],refit=f1_score
classifier.fit(x_train , train_sentiments)

In [None]:
# Evaluate the fitted logistic-regression search on the test matrix
pred_sentiments = classifier.predict(x_test)

# Arguments follow the (y_true, y_pred) order of the sklearn signature, the
# same order used for classification_report below. They were swapped before;
# accuracy is symmetric, so the printed value is unchanged.
accuracy = accuracy_score(test_sentiments , pred_sentiments)
print(accuracy)
print('\n')
print(classification_report(test_sentiments,pred_sentiments))

In [None]:
#plotting confusion_matrix of Logistic_Regression
print('Confusion Matrix of Logistic Regression is given below:')
cf_matrix = confusion_matrix(test_sentiments,pred_sentiments)
plt.subplots(figsize=(10,10))
sns.set(font_scale=2)
sns.heatmap(cf_matrix/np.sum(cf_matrix), annot=True , cmap='BuPu',linecolor='green' , linewidths=3)
plt.xlabel("Predicted values", fontdict = {'size':22})
plt.ylabel("Actual values"   , fontdict = {'size':22})
plt.title ("Confusion Matrix", fontdict = {'size':22})

# **Accuracy from MultinomialNB model = 63.15%**
# **Accuracy from Logistic Regression model = 72.85%** 
# **Hence we can conclude that Logistic Regression is better model**