<a href="https://www.kaggle.com/code/prasadposture121/twitter-sentiment-analysis-using-dtc?scriptVersionId=135688536" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# Twitter Sentiment Analysis using NLTK and DTC

In [1]:
# Importing the dependencies
import pandas as pd
import numpy as np
import re
import string
import nltk
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Loading the data and renaming the columns
df=pd.read_csv('/kaggle/input/twitter-entity-sentiment-analysis/twitter_training.csv', header=None)
df.columns=['id','information','label','tweet']
df.head()

Unnamed: 0,id,information,label,tweet
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


In [3]:
# Reassigning the data type
df['tweet'] = df['tweet'].astype(str)

In [4]:
# Counting the unique values from the table
df['label'].value_counts()

Negative      22542
Positive      20832
Neutral       18318
Irrelevant    12990
Name: label, dtype: int64

In [5]:
# Replace categorical values with respective numbers
df=df[df['label']!='Irrelevant']
df.reset_index(inplace=True)
df['label']=df['label'].replace(['Negative','Neutral','Positive'],[-1,0,1])
df['label'].unique()

array([ 1,  0, -1])

In [6]:
# Removing any other patterns
def remove_pattern(input_txt, pattern):
    r = re.findall(pattern, input_txt)
    for word in r:
        input_txt = re.sub(word, "", input_txt)
    return input_txt
df['clean_tweet'] = np.vectorize(remove_pattern)(df['tweet'], "@[\w]*")
df.head()

Unnamed: 0,index,id,information,label,tweet,clean_tweet
0,0,2401,Borderlands,1,im getting on borderlands and i will murder yo...,im getting on borderlands and i will murder yo...
1,1,2401,Borderlands,1,I am coming to the borders and I will kill you...,I am coming to the borders and I will kill you...
2,2,2401,Borderlands,1,im getting on borderlands and i will kill you ...,im getting on borderlands and i will kill you ...
3,3,2401,Borderlands,1,im coming on borderlands and i will murder you...,im coming on borderlands and i will murder you...
4,4,2401,Borderlands,1,im getting on borderlands 2 and i will murder ...,im getting on borderlands 2 and i will murder ...


In [7]:
# Removing Special Characters
df['clean_tweet'] = df['clean_tweet'].str.replace("[^a-zA-Z#]"," ")
df.head()

Unnamed: 0,index,id,information,label,tweet,clean_tweet
0,0,2401,Borderlands,1,im getting on borderlands and i will murder yo...,im getting on borderlands and i will murder yo...
1,1,2401,Borderlands,1,I am coming to the borders and I will kill you...,I am coming to the borders and I will kill you...
2,2,2401,Borderlands,1,im getting on borderlands and i will kill you ...,im getting on borderlands and i will kill you ...
3,3,2401,Borderlands,1,im coming on borderlands and i will murder you...,im coming on borderlands and i will murder you...
4,4,2401,Borderlands,1,im getting on borderlands 2 and i will murder ...,im getting on borderlands and i will murder ...


In [8]:
# Removing the shorter words
df['clean_tweet'] = df['clean_tweet'].apply(lambda x:" ".join([w for w in x.split() if len(w)>3]))
df.head()

Unnamed: 0,index,id,information,label,tweet,clean_tweet
0,0,2401,Borderlands,1,im getting on borderlands and i will murder yo...,getting borderlands will murder
1,1,2401,Borderlands,1,I am coming to the borders and I will kill you...,coming borders will kill
2,2,2401,Borderlands,1,im getting on borderlands and i will kill you ...,getting borderlands will kill
3,3,2401,Borderlands,1,im coming on borderlands and i will murder you...,coming borderlands will murder
4,4,2401,Borderlands,1,im getting on borderlands 2 and i will murder ...,getting borderlands will murder


In [9]:
# Converting each tweet into list
tokenized_tweet = df['clean_tweet'].apply(lambda x: x.split())
tokenized_tweet

0                     [getting, borderlands, will, murder]
1                            [coming, borders, will, kill]
2                       [getting, borderlands, will, kill]
3                      [coming, borderlands, will, murder]
4                     [getting, borderlands, will, murder]
                               ...                        
61687    [Just, realized, that, Windows, partition, lik...
61688    [Just, realized, that, window, partition, year...
61689    [Just, realized, windows, partition, years, be...
61690    [Just, realized, between, windows, partition, ...
61691    [Just, like, windows, partition, like, years, ...
Name: clean_tweet, Length: 61692, dtype: object

In [10]:
# Keeping the common words
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()
tokenized_tweet = tokenized_tweet.apply(lambda sentence: [stemmer.stem(word) for word in sentence])
tokenized_tweet

0                          [get, borderland, will, murder]
1                               [come, border, will, kill]
2                            [get, borderland, will, kill]
3                         [come, borderland, will, murder]
4                          [get, borderland, will, murder]
                               ...                        
61687    [just, realiz, that, window, partit, like, yea...
61688    [just, realiz, that, window, partit, year, beh...
61689    [just, realiz, window, partit, year, behind, n...
61690    [just, realiz, between, window, partit, like, ...
61691    [just, like, window, partit, like, year, behin...
Name: clean_tweet, Length: 61692, dtype: object

In [11]:
# Joining the common words back to the sentences and appending them to dataframe
for i in range(len(tokenized_tweet)):
    tokenized_tweet[i]=" ".join(tokenized_tweet[i])
df['clean_tweet']=tokenized_tweet
df.head()

Unnamed: 0,index,id,information,label,tweet,clean_tweet
0,0,2401,Borderlands,1,im getting on borderlands and i will murder yo...,get borderland will murder
1,1,2401,Borderlands,1,I am coming to the borders and I will kill you...,come border will kill
2,2,2401,Borderlands,1,im getting on borderlands and i will kill you ...,get borderland will kill
3,3,2401,Borderlands,1,im coming on borderlands and i will murder you...,come borderland will murder
4,4,2401,Borderlands,1,im getting on borderlands 2 and i will murder ...,get borderland will murder


In [12]:
# Bag of Words: Cause Computer understands numbers not English
from sklearn.feature_extraction.text import CountVectorizer
bow_vectorizer = CountVectorizer(max_df=0.90, min_df=2, max_features=1000, stop_words="english")
bow = bow_vectorizer.fit_transform(df['clean_tweet'])

In [13]:
# Train, Test and Split
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(bow, df['label'], random_state=42, test_size=0.25)

In [14]:
# Fitting the model on train data
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier(random_state=42)
model.fit(x_train, y_train)

In [15]:
# Making Predictions on Test
pred = model.predict(x_test)

In [16]:
def accuracy(pred, y_test):
    l=0
    for i,j in zip(pred, y_test):
        if i==j:
            l=l+1
    return l/len(pred)
accuracy(pred, y_test)

0.7914154185307657

In [17]:
# Accuracy on seen data
model = DecisionTreeClassifier(random_state=42)
model.fit(x_train, y_train)
preds=model.predict(x_train)
accuracy(preds,y_train)

0.9326979186928613

In [18]:
model.tree_.max_depth

719

In [19]:
# Finding appropriate values for parameter
# Takes a lot of time
parameter = []
test_accuracy = []
for i in range(1,720):
    parameter.append(i)
    model = DecisionTreeClassifier(random_state=42, max_depth=i)
    model.fit(x_train, y_train)
    pred_test = model.predict(x_test)
    test_acc = accuracy(y_test, pred_test)
    test_accuracy.append(test_acc)
best_param=parameter[test_accuracy.index(max(test_accuracy))]
print('best_param: ',best_param)

best_param:  670


In [20]:
# Training with best paramter
model = DecisionTreeClassifier(random_state=42, max_depth=best_param)
model.fit(x_train, y_train)

In [21]:
# Making Predictions on Test Data
pred = model.predict(x_test)
accuracy(y_test, pred)

0.7944628152758867

In [22]:
# Saving the model
import joblib as jb
tsa = {
    'model':model,
    'bow_vectorizer': bow_vectorizer
}
jb.dump(tsa, 'tsa.joblib')

['tsa.joblib']

In [23]:
# Summurizing the processes which could be used for web application
import pandas as pd
import numpy as np
import re
import string
import nltk
import joblib as jb
import warnings
warnings.filterwarnings("ignore")
tsa = jb.load('tsa.joblib')
model = tsa['model']
bow_vectorizer = tsa['bow_vectorizer']
#input_tweet= input("Enter the tweet here: ") uncomment this before using, comment the below one
input_tweet="India lifts the world cup after 25 years, as the celebration started in the dressing room. Everybody enjoyed that night."
df = pd.DataFrame([{'tweet':input_tweet}])
def remove_pattern(input_txt, pattern):
    r = re.findall(pattern, input_txt)
    for word in r:
        input_txt = re.sub(word, "", input_txt)
    return input_txt
df['clean_tweet'] = np.vectorize(remove_pattern)(df['tweet'], "@[\w]*")
df['clean_tweet'] = df['clean_tweet'].str.replace("[^a-zA-Z#]"," ")
df['clean_tweet'] = df['clean_tweet'].apply(lambda x:" ".join([w for w in x.split() if len(w)>3]))
tokenized_tweet = df['clean_tweet'].apply(lambda x: x.split())
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()
tokenized_tweet = tokenized_tweet.apply(lambda sentence: [stemmer.stem(word) for word in sentence])
for i in range(len(tokenized_tweet)):
    tokenized_tweet[i]=" ".join(tokenized_tweet[i])
df['clean_tweet']=tokenized_tweet
bow = bow_vectorizer.transform(df['clean_tweet'])
pred = model.predict(bow)
if pred==1:
    print("Positive Tweet")
elif pred==0:
    print("Neutral Tweet")
elif pred==-1:
    print("Negative Tweet")

Positive Tweet


The End