# Twitter Sentiment Analysis using NLTK and DTC

In [62]:
import pandas as pd
import numpy as np
import re
import string
import nltk
import warnings
warnings.filterwarnings('ignore')

In [63]:
df=pd.read_csv('/kaggle/input/twitter-sentiment-analysis-analytics-vidya/train_E6oV3lV.csv')
df.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [64]:
df['label'].value_counts()

0    29720
1     2242
Name: label, dtype: int64

In [65]:
df0 = df[df['label']==0].sample(frac=0.07543741588156123)
df1 = df[df['label']==1].copy()
df = pd.concat([df0, df1], axis=0)
df.reset_index(drop=True, inplace=True)
df.head()

Unnamed: 0,id,label,tweet
0,25587,0,green morning :) #smoothie #greensmoothie #bre...
1,8016,0,today my parents celebrate 53 years of marriag...
2,10034,0,who needs a #superhero when you have #daddy - ...
3,21843,0,i can't wait for my e.l.f products to come in ...
4,10793,0,@user i completely forgot that was home to @u...


In [66]:
def remove_pattern(input_txt, pattern):
    r = re.findall(pattern, input_txt)
    for word in r:
        input_txt = re.sub(word, "", input_txt)
    return input_txt

In [67]:
df['clean_tweet'] = np.vectorize(remove_pattern)(df['tweet'], "@[\w]*")
df.head()

Unnamed: 0,id,label,tweet,clean_tweet
0,25587,0,green morning :) #smoothie #greensmoothie #bre...,green morning :) #smoothie #greensmoothie #bre...
1,8016,0,today my parents celebrate 53 years of marriag...,today my parents celebrate 53 years of marriag...
2,10034,0,who needs a #superhero when you have #daddy - ...,who needs a #superhero when you have #daddy - ...
3,21843,0,i can't wait for my e.l.f products to come in ...,i can't wait for my e.l.f products to come in ...
4,10793,0,@user i completely forgot that was home to @u...,i completely forgot that was home to for ye...


In [68]:
df['clean_tweet'] = df['clean_tweet'].str.replace("[^a-zA-Z#]"," ")

In [69]:
df['clean_tweet'] = df['clean_tweet'].apply(lambda x:" ".join([w for w in x.split() if len(w)>3]))
df.head()

Unnamed: 0,id,label,tweet,clean_tweet
0,25587,0,green morning :) #smoothie #greensmoothie #bre...,green morning #smoothie #greensmoothie #breakf...
1,8016,0,today my parents celebrate 53 years of marriag...,today parents celebrate years marriage love de...
2,10034,0,who needs a #superhero when you have #daddy - ...,needs #superhero when have #daddy #fathersday ...
3,21843,0,i can't wait for my e.l.f products to come in ...,wait products come mail #elfcosmetics
4,10793,0,@user i completely forgot that was home to @u...,completely forgot that home years


In [70]:
tokenized_tweet = df['clean_tweet'].apply(lambda x: x.split())

In [71]:
tokenized_tweet

0       [green, morning, #smoothie, #greensmoothie, #b...
1       [today, parents, celebrate, years, marriage, l...
2       [needs, #superhero, when, have, #daddy, #fathe...
3             [wait, products, come, mail, #elfcosmetics]
4                 [completely, forgot, that, home, years]
                              ...                        
4479    [lady, banned, from, kentucky, mall, #jcpenny,...
4480    [omfg, offended, mailbox, proud, #mailboxpride...
4481    [have, balls, hashtag, weasel, away, lumpy, to...
4482       [makes, yourself, then, anybody, until, thank]
4483    [#sikh, #temple, vandalised, #calgary, #wso, c...
Name: clean_tweet, Length: 4484, dtype: object

In [72]:
from nltk.stem.porter import PorterStemmer

In [73]:
stemmer = PorterStemmer()
tokenized_tweet = tokenized_tweet.apply(lambda sentence: [stemmer.stem(word) for word in sentence])
tokenized_tweet

0       [green, morn, #smoothi, #greensmoothi, #breakf...
1       [today, parent, celebr, year, marriag, love, d...
2       [need, #superhero, when, have, #daddi, #father...
3                 [wait, product, come, mail, #elfcosmet]
4                     [complet, forgot, that, home, year]
                              ...                        
4479    [ladi, ban, from, kentucki, mall, #jcpenni, #k...
4480    [omfg, offend, mailbox, proud, #mailboxprid, #...
4481    [have, ball, hashtag, weasel, away, lumpi, ton...
4482        [make, yourself, then, anybodi, until, thank]
4483    [#sikh, #templ, vandalis, #calgari, #wso, cond...
Name: clean_tweet, Length: 4484, dtype: object

In [74]:
for i in range(len(tokenized_tweet)):
    tokenized_tweet[i]=" ".join(tokenized_tweet[i])
df['clean_tweet']=tokenized_tweet
df.head()

Unnamed: 0,id,label,tweet,clean_tweet
0,25587,0,green morning :) #smoothie #greensmoothie #bre...,green morn #smoothi #greensmoothi #breakfast #...
1,8016,0,today my parents celebrate 53 years of marriag...,today parent celebr year marriag love dedic th...
2,10034,0,who needs a #superhero when you have #daddy - ...,need #superhero when have #daddi #fathersday #...
3,21843,0,i can't wait for my e.l.f products to come in ...,wait product come mail #elfcosmet
4,10793,0,@user i completely forgot that was home to @u...,complet forgot that home year


In [75]:
df['clean_tweet']

0       green morn #smoothi #greensmoothi #breakfast #...
1       today parent celebr year marriag love dedic th...
2       need #superhero when have #daddi #fathersday #...
3                       wait product come mail #elfcosmet
4                           complet forgot that home year
                              ...                        
4479       ladi ban from kentucki mall #jcpenni #kentucki
4480    omfg offend mailbox proud #mailboxprid #libera...
4481     have ball hashtag weasel away lumpi toni dipshit
4482               make yourself then anybodi until thank
4483          #sikh #templ vandalis #calgari #wso condemn
Name: clean_tweet, Length: 4484, dtype: object

In [76]:
from sklearn.feature_extraction.text import CountVectorizer

In [77]:
bow_vectorizer = CountVectorizer(max_df=0.90, min_df=2, max_features=1000, stop_words="english")
bow = bow_vectorizer.fit_transform(df['clean_tweet'])

In [78]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(bow, df['label'], random_state=42, test_size=0.25)

In [79]:
from sklearn.tree import DecisionTreeClassifier

In [80]:
model = DecisionTreeClassifier(random_state=42)
model.fit(x_train, y_train)

In [81]:
from sklearn.metrics import f1_score, accuracy_score
pred = model.predict(x_test)
f1_score(y_test, pred)

0.8036036036036036

In [82]:
parameter = []
test_accuracy = []
for i in range(1,1000):
    parameter.append(i)
    model = DecisionTreeClassifier(random_state=42, max_depth=i)
    model.fit(x_train, y_train)
    pred_test = model.predict(x_test)
    test_acc = accuracy_score(y_test, pred_test)
    test_accuracy.append(test_acc)
best_param=parameter[test_accuracy.index(max(test_accuracy))]
print('best_param: ',best_param)

best_param:  241


In [83]:
model = DecisionTreeClassifier(random_state=42, max_depth=best_param)
model.fit(x_train, y_train)

In [84]:
from sklearn.metrics import f1_score, accuracy_score
pred = model.predict(x_test)
f1_score(y_test, pred)

0.8106060606060606

In [85]:
accuracy_score(y_test, pred)

0.8215878679750223

In [86]:
# Saving the model
import joblib as jb
tsa = {
    'model':model,
    'bow_vectorizer': bow_vectorizer
}
jb.dump(tsa, 'tsa.joblib')

['tsa.joblib']

In [87]:
# Summurizing the processes which could be used for web application
import pandas as pd
import numpy as np
import re
import string
import nltk
import joblib as jb
import warnings
warnings.filterwarnings("ignore")
tsa = jb.load('tsa.joblib')
model = tsa['model']
bow_vectorizer = tsa['bow_vectorizer']
input_tweet= input("Enter the tweet here: ")
df = pd.DataFrame([{'tweet':input_tweet}])
def remove_pattern(input_txt, pattern):
    r = re.findall(pattern, input_txt)
    for word in r:
        input_txt = re.sub(word, "", input_txt)
    return input_txt
df['clean_tweet'] = np.vectorize(remove_pattern)(df['tweet'], "@[\w]*")
df['clean_tweet'] = df['clean_tweet'].str.replace("[^a-zA-Z#]"," ")
df['clean_tweet'] = df['clean_tweet'].apply(lambda x:" ".join([w for w in x.split() if len(w)>3]))
tokenized_tweet = df['clean_tweet'].apply(lambda x: x.split())
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()
tokenized_tweet = tokenized_tweet.apply(lambda sentence: [stemmer.stem(word) for word in sentence])
for i in range(len(tokenized_tweet)):
    tokenized_tweet[i]=" ".join(tokenized_tweet[i])
df['clean_tweet']=tokenized_tweet
bow = bow_vectorizer.transform(df['clean_tweet'])
pred = model.predict(bow)
pred
if pred==0:
    print("the tweet is postitive")
else:
    print("the tweet is negative")

Enter the tweet here:  kill thigh enemy


the tweet is negative


The End