In [None]:
import pandas as pd
# Load the transfer learning tweet dataset
df = pd.read_csv('../input/twitterdata/finalSentimentdata2.csv')
# Preview the first rows to confirm the file was parsed as expected
df.head()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Count the missing (NaN) values in each column
df.isnull().sum()

In [None]:
# Drop the unnecessary 'Unnamed: 0' column.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps the
# cell idempotent: re-running it no longer raises a KeyError once the column
# has already been removed.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
df.columns

In [None]:
df['sentiment'].value_counts()

In [None]:

# Bar chart of the number of rows per sentiment class.
sns.set_style("whitegrid")
plt.figure(figsize=(10,5))
# Pass the column by keyword: a positional Series is deprecated in seaborn 0.11
# and is interpreted as `data` (not `x`) from seaborn 0.12 onwards.
sns.countplot(x='sentiment', data=df)

In [None]:

from sklearn.preprocessing import LabelEncoder
# NOTE: despite its name, `scaler` is a LabelEncoder, not a feature scaler.
scaler=LabelEncoder()
# Replace the sentiment labels with the encoder's integer class codes
# (the 'sentiment' column of df is overwritten).
df['sentiment']=scaler.fit_transform(df['sentiment'])

In [None]:
# Integer codes used for the sentiment classes:
#   0 -> anger
#   1 -> fear
#   2 -> joy
#   3 -> sad
# Number of rows per encoded class
df['sentiment'].value_counts()

In [None]:
df.columns

In [None]:
df['text'][:5]

In [None]:
df.shape

In [None]:
# Make a copy of the original dataset; the text-cleaning cells read from it
message=df.copy()

In [None]:
message.head(3)

In [None]:
message.isnull().sum()

# Let's take a single paragraph, for example message['text'][0], and check how the cleaning works on it; after that, the same cleaning will be applied to the whole dataset.

In [None]:
message['text'][0]

In [None]:
# Sample paragraph used to try out the cleaning steps on a single text first.
# NOTE(review): the literal opens with four quotes, so the string itself starts
# with a stray apostrophe; the '[^a-zA-Z]' substitution applied later removes it.
para=''''agree the poor in india are treated badly their poors 
seek a living in singapore and are treated like citizens they 
are given free medical treatment given food daily sim cards
to call home to tell their family that they are fine if covid 
19 case treated foc in hospitals'''

# stemming and cleaning the text

In [None]:
import nltk
import re

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
corpus = []

# Build the stop-word set once. stopwords.words() returns a list, so calling it
# inside the comprehension rebuilt it and scanned it linearly for every word.
stop_words = set(stopwords.words('english'))

review = re.sub(r'http\S+', ' ', para)      # remove all link-related text
review = re.sub('[^a-zA-Z]', ' ', review)   # keep only the letters a-z and A-Z
review = review.lower()                     # lowercase the text
review = review.split()

# Remove the stop words, then stem the remaining words
review = [ps.stem(word) for word in review if word not in stop_words]
review = ' '.join(review)
corpus.append(review)

In [None]:
corpus

# It seems that about 80% of the cleaning is done by stemming; now let's check it with lemmatizing.

In [None]:
from nltk.stem import WordNetLemmatizer
lem=WordNetLemmatizer()

# Build the stop-word set once instead of once per word inside the comprehension.
stop_words = set(stopwords.words('english'))

corpus_lem=[]
review = re.sub(r'http\S+', ' ', para)      # remove all link-related text
review = re.sub('[^a-zA-Z]', ' ', review)   # keep only the letters a-z and A-Z
review = review.lower()                     # lowercase the text
review = review.split()

# Remove the stop words, then lemmatize the remaining words
review = [lem.lemmatize(word) for word in review if word not in stop_words]
review = ' '.join(review)
corpus_lem.append(review)

corpus_lem

# Now let's apply this to the whole dataset to clean the text.

In [None]:
# Stem and clean every text in the dataset.
# The stop-word set is built once (it was rebuilt as a list for every single
# word before), and the column is iterated directly so the loop does not depend
# on the frame having a default 0..n-1 index.
stop_words = set(stopwords.words('english'))

corpus_stem = []
for text in message['text']:
    review = re.sub(r'http\S+', ' ', text)      # drop links
    review = re.sub('[^a-zA-Z]', ' ', review)   # keep letters only
    review = review.lower()
    review = review.split()

    # Remove the stop words, then stem what is left
    review = [ps.stem(word) for word in review if word not in stop_words]
    review = ' '.join(review)
    corpus_stem.append(review)

In [None]:
len(message)

In [None]:
# New frame that starts with only the encoded 'sentiment' column
df2=pd.DataFrame(df['sentiment'],index=None)

In [None]:
df2['stemming_text']=corpus_stem

In [None]:
df2.head(4)

# Most commonly used Anger words 

In [None]:

from wordcloud import WordCloud
# Word cloud of the stemmed text of every row encoded as anger (label 0)
anger_text = df2.loc[df2['sentiment'] == 0]
all_words = ' '.join(anger_text['stemming_text'])
wordcloud = WordCloud(
    width=1000,
    height=800,
    max_font_size=120,
    collocations=False,
)
wordcloud.generate(all_words)

fig, ax = plt.subplots(figsize=(10, 7))
ax.imshow(wordcloud, interpolation='bilinear')
ax.set_title("Most Used anger words", fontsize=20)
ax.axis("off")
plt.show()

# Most commonly used Fear words

In [None]:

from wordcloud import WordCloud
# Word cloud of the stemmed text of every row encoded as fear (label 1)
fear_text = df2.loc[df2['sentiment'] == 1]
all_words = ' '.join(fear_text['stemming_text'])
wordcloud = WordCloud(
    width=1000,
    height=800,
    max_font_size=120,
    collocations=False,
)
wordcloud.generate(all_words)

fig, ax = plt.subplots(figsize=(10, 7))
ax.imshow(wordcloud, interpolation='bilinear')
ax.set_title("Most Used fear words", fontsize=20)
ax.axis("off")
plt.show()

# Most commonly used Joy words

In [None]:
from wordcloud import WordCloud
# Word cloud of the stemmed text of every row encoded as joy (label 2)
joy_text = df2.loc[df2['sentiment'] == 2]
all_words = ' '.join(joy_text['stemming_text'])
wordcloud = WordCloud(
    width=1000,
    height=800,
    max_font_size=120,
    collocations=False,
)
wordcloud.generate(all_words)

fig, ax = plt.subplots(figsize=(10, 7))
ax.imshow(wordcloud, interpolation='bilinear')
ax.set_title("Most Used joy words", fontsize=20)
ax.axis("off")
plt.show()

# Most commonly used Sad words

In [None]:
from wordcloud import WordCloud
# Word cloud of the stemmed text of every row encoded as sad (label 3)
sad_text = df2.loc[df2['sentiment'] == 3]
all_words = ' '.join(sad_text['stemming_text'])
wordcloud = WordCloud(
    width=1000,
    height=800,
    max_font_size=120,
    collocations=False,
)
wordcloud.generate(all_words)

fig, ax = plt.subplots(figsize=(10, 7))
ax.imshow(wordcloud, interpolation='bilinear')
ax.set_title("Most Used sad words", fontsize=20)
ax.axis("off")
plt.show()

In [None]:
from nltk import tokenize
token_space = tokenize.WhitespaceTokenizer()
def counter(text, column_text, quantity):
    """Plot a bar chart of the most frequent whitespace-separated tokens.

    Parameters
    ----------
    text : pandas.DataFrame
        Frame holding the documents (the parameter name is kept for
        backward compatibility with existing calls).
    column_text : str
        Name of the column in ``text`` whose strings are counted.
    quantity : int
        Number of most frequent tokens to show.

    Returns
    -------
    None
        The chart is rendered with ``plt.show()``.
    """
    # Join straight from the column. The original comprehension reused the name
    # `text` for its loop variable, shadowing the DataFrame parameter.
    all_words = ' '.join(text[column_text])
    token_phrase = token_space.tokenize(all_words)
    if not token_phrase:
        # An empty frequency table has object-dtype columns, on which
        # nlargest() raises; report it and skip the plot instead.
        print("No words to count.")
        return

    frequency = nltk.FreqDist(token_phrase)
    df_frequency = pd.DataFrame({"Word": list(frequency.keys()),
                                 "Frequency": list(frequency.values())})
    # Keep only the `quantity` most frequent tokens
    df_frequency = df_frequency.nlargest(columns="Frequency", n=quantity)

    plt.figure(figsize=(15,8))
    ax = sns.barplot(data=df_frequency, x="Word", y="Frequency", color='yellow')
    ax.set(ylabel="Count")
    plt.xticks(rotation='vertical')
    plt.show()
    
    


In [None]:
# Bar chart of the 20 most frequent stemmed words in the anger rows (label 0)
counter(df2[df2['sentiment'] == 0], 'stemming_text', 20)

In [None]:
# Bar chart of the 20 most frequent stemmed words in the fear rows (label 1)
counter(df2[df2['sentiment'] == 1], 'stemming_text', 20)

In [None]:
# Bar chart of the 20 most frequent stemmed words in the joy rows (label 2)
counter(df2[df2['sentiment'] == 2], 'stemming_text', 20)

In [None]:
# Bar chart of the 20 most frequent stemmed words in the sad rows (label 3)
counter(df2[df2['sentiment'] == 3], 'stemming_text', 20)

In [None]:
# Lemmatize and clean every text in the dataset.
# The stop-word set is built once (it was rebuilt for every single word
# before), and the column is iterated directly so the loop does not depend on
# the frame having a default 0..n-1 index.
stop_words = set(stopwords.words('english'))

corpus_lemmetize = []
for text in message['text']:
    review = re.sub(r'http\S+', ' ', text)      # drop links
    review = re.sub('[^a-zA-Z]', ' ', review)   # keep letters only
    review = review.lower()
    review = review.split()

    # Remove the stop words, then lemmatize what is left
    review = [lem.lemmatize(word) for word in review if word not in stop_words]
    review = ' '.join(review)
    corpus_lemmetize.append(review)

In [None]:
corpus_lemmetize[:2]

In [None]:
len(corpus_lemmetize)

In [None]:
#countvectoriser with stemming

from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words counts for the stemmed corpus: at most 5000 features drawn from
# 1- to 3-grams, converted to a dense array.
CV = CountVectorizer(
    ngram_range=(1, 3),
    max_features=5000,
)
stem_counts = CV.fit_transform(corpus_stem)
x_stem = stem_counts.toarray()
x_stem

In [None]:
print(x_stem.shape)

In [None]:
#countvectorizer with lemmetizing

from sklearn.feature_extraction.text import CountVectorizer
# Use a separate vectorizer object for the lemmatized corpus so that `CV` keeps
# the vocabulary fitted on the stemmed corpus (x_stem), which the later
# modelling cells train on.
# NOTE(review): this vectorizer uses unigrams only, while the stemmed one uses
# ngram_range=(1,3), so the stem-vs-lemma comparison also differs in features
# -- confirm this is intended.
CV_lem = CountVectorizer(max_features=5000)
x_lem = CV_lem.fit_transform(corpus_lemmetize).toarray()
x_lem

In [None]:
print(x_lem.shape)

In [None]:
#TF-IDF for stemming

from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF features for the stemmed corpus (default vectorizer settings)
tf_stem = TfidfVectorizer()
x_tf_stem = tf_stem.fit_transform(corpus_stem)
n_rows, n_features = x_tf_stem.shape
print((n_rows, n_features))


In [None]:
#TF-IDF for lemmatizing

from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF features for the lemmatized corpus. A dedicated `tf_lem` vectorizer is
# used so the one fitted on the stemmed corpus (`tf_stem`) is not overwritten.
tf_lem = TfidfVectorizer()
x_tf_lem = tf_lem.fit_transform(corpus_lemmetize)
print(x_tf_lem.shape)

In [None]:
# Target vector: the integer-encoded sentiment column
y=df['sentiment']
# Peek at the first five targets
y[:5]

#  Making the model by using Countvectorizer and Stemming

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing. A fixed random_state makes the split
# (and every score computed from it) reproducible across kernel restarts;
# stratify=y keeps the class proportions equal in the train and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    x_stem, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
from sklearn.naive_bayes import MultinomialNB
# Multinomial Naive Bayes with default settings, trained on the training split
model = MultinomialNB()
model = model.fit(X_train, y_train)

In [None]:
model.score(X_test,y_test)

In [None]:
from sklearn.metrics import confusion_matrix
# Predict the held-out rows and tabulate true vs. predicted classes
y_pred = model.predict(X_test)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
cm

In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Annotated heat map of the confusion matrix
fig, ax = plt.subplots(figsize=(10, 5))
sns.heatmap(cm, annot=True, fmt='d', ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('True')

# Making the model by using lemmatizing and CountVectorizer

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing. A fixed random_state makes the split
# (and every score computed from it) reproducible across kernel restarts;
# stratify=y keeps the class proportions equal in the train and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    x_lem, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
from sklearn.naive_bayes import MultinomialNB
# Multinomial Naive Bayes with default settings, trained on the training split
model = MultinomialNB()
model = model.fit(X_train, y_train)

In [None]:
model.score(X_test,y_test)

In [None]:
from sklearn.metrics import confusion_matrix
# Predict the held-out rows and tabulate true vs. predicted classes
y_pred = model.predict(X_test)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
cm

In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

In [None]:
# Annotated heat map of the confusion matrix
fig, ax = plt.subplots(figsize=(10, 5))
sns.heatmap(cm, annot=True, fmt='d', ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('True')

# Making the model by using TF-IDF and stemming

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing. A fixed random_state makes the split
# (and every score computed from it) reproducible across kernel restarts;
# stratify=y keeps the class proportions equal in the train and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    x_tf_stem, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
from sklearn.naive_bayes import MultinomialNB
# Multinomial Naive Bayes with default settings, trained on the training split
model = MultinomialNB()
model = model.fit(X_train, y_train)

In [None]:
model.score(X_test,y_test)

In [None]:
from sklearn.metrics import confusion_matrix
# Predict the held-out rows and tabulate true vs. predicted classes
y_pred = model.predict(X_test)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
cm

In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

In [None]:
# Annotated heat map of the confusion matrix
fig, ax = plt.subplots(figsize=(10, 5))
sns.heatmap(cm, annot=True, fmt='d', ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('True')

# Making the model by using TF-IDF and lemmatizing

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing. A fixed random_state makes the split
# (and every score computed from it) reproducible across kernel restarts;
# stratify=y keeps the class proportions equal in the train and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    x_tf_lem, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
x_tf_lem.shape

In [None]:
from sklearn.naive_bayes import MultinomialNB
# Multinomial Naive Bayes with default settings, trained on the training split
model = MultinomialNB()
model = model.fit(X_train, y_train)

In [None]:
model.score(X_test,y_test)

In [None]:
from sklearn.metrics import confusion_matrix
# Predict the held-out rows and tabulate true vs. predicted classes
y_pred = model.predict(X_test)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
cm

In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

In [None]:
# Annotated heat map of the confusion matrix
fig, ax = plt.subplots(figsize=(10, 5))
sns.heatmap(cm, annot=True, fmt='d', ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('True')


# Let's check the accuracy after tuning the alpha hyperparameter of MultinomialNB

In [None]:
from sklearn.model_selection import train_test_split
# Fixed random_state / stratified split so the alpha comparison is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    x_stem, y, test_size=0.2, random_state=42, stratify=y
)

classifier=MultinomialNB(alpha=0.1)

from sklearn import metrics
import numpy as np

# Try every alpha in np.arange(0, 1, 0.1) and keep the model with the highest
# test accuracy in `classifier`.
# NOTE(review): alpha is selected on the test split, so the best score is an
# optimistic estimate.
previous_score=0   # best accuracy seen so far
best_alpha=None
for alpha in np.arange(0,1,0.1):
    sub_classifier=MultinomialNB(alpha=alpha)
    sub_classifier.fit(X_train,y_train)
    y_pred=sub_classifier.predict(X_test)
    score = metrics.accuracy_score(y_test, y_pred)
    if score>previous_score:
        # previous_score was never updated before, so `classifier` effectively
        # ended up as the last model tried instead of the best one.
        previous_score=score
        best_alpha=alpha
        classifier=sub_classifier
    print("Alpha: {}, Score : {}".format(alpha,score))
print("Best alpha: {}, Score : {}".format(best_alpha,previous_score))

# When alpha=0.6, it gives the maximum accuracy of 0.6747572815533981

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import  DecisionTreeClassifier
from sklearn.svm import SVC

from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import classification_report,confusion_matrix

# Now let's check other classification algorithms (DecisionTreeClassifier, RandomForestClassifier, SVM and LogisticRegression) by using GridSearchCV with cross-validation

In [None]:
# Candidate classifiers and the hyperparameter grid searched for each one.
# Estimators that use randomness get a fixed random_state so the grid-search
# scores are reproducible from run to run.
model_params = {
    'svm': {
        'model': SVC(gamma='auto'),
        'params' : {
            'C': [1,10,20,25,30,40],
            'kernel': ['rbf','linear']
        }
    },
    'random_forest': {
        'model': RandomForestClassifier(random_state=42),
        'params' : {
            'n_estimators': [1,5,10,15,20,25,30]
        }
    },
    'logistic_regression' : {
        'model': LogisticRegression(solver='liblinear', random_state=42),
        'params': {
            'C': [1,5,10,15,20,25]
        }
    },
    'decision_tree': {
        'model': DecisionTreeClassifier(random_state=42),
        'params':{
            'criterion':['gini','entropy']
        }
    }
}

In [None]:
scores = []

# Fixed random_state / stratified split so the comparison table is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    x_stem, y, test_size=0.2, random_state=42, stratify=y
)

for model_name, mp in model_params.items():
    # 5-fold cross-validated grid search over this model's parameter grid,
    # fitted on the training split only
    clf = GridSearchCV(mp['model'], mp['params'], cv=5, return_train_score=False)
    clf.fit(X_train, y_train)
    scores.append({
        'model': model_name,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_
    })

# One row per model: best mean cross-validation score and the parameters that produced it
df_score = pd.DataFrame(scores, columns=['model','best_score','best_params'])
df_score

# Hence, we can conclude that Logistic Regression gives the best result, with an accuracy of 0.69