# Twitter Sentiment Analysis

In [1]:
import string
import nltk
from nltk.corpus import wordnet,stopwords
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import GridSearchCV,cross_val_score
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
training_data=pd.read_csv('training_twitter_x_train.csv')

In [3]:
training_data.head()

Unnamed: 0,tweet_id,airline_sentiment,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,567900433542488064,negative,Southwest,,ColeyGirouard,,0,"@SouthwestAir I am scheduled for the morning, ...",,2015-02-17 20:16:29 -0800,Washington D.C.,Atlantic Time (Canada)
1,569989168903819264,positive,Southwest,,WalterFaddoul,,0,@SouthwestAir seeing your workers time in and ...,,2015-02-23 14:36:22 -0800,"Indianapolis, Indiana; USA",Central Time (US & Canada)
2,568089179520954368,positive,United,,LocalKyle,,0,@united Flew ORD to Miami and back and had gr...,,2015-02-18 08:46:29 -0800,Illinois,Central Time (US & Canada)
3,568928195581513728,negative,Southwest,,amccarthy19,,0,@SouthwestAir @dultch97 that's horse radish 😤🐴,,2015-02-20 16:20:26 -0800,,Atlantic Time (Canada)
4,568594180014014464,negative,United,,J_Okayy,,0,@united so our flight into ORD was delayed bec...,,2015-02-19 18:13:11 -0800,,Eastern Time (US & Canada)


In [4]:
test_text = pd.read_csv('training_twitter_x_train.csv')['text']

In [5]:
test_text.head()

0    @SouthwestAir I am scheduled for the morning, ...
1    @SouthwestAir seeing your workers time in and ...
2    @united Flew ORD to Miami and back and  had gr...
3       @SouthwestAir @dultch97 that's horse radish 😤🐴
4    @united so our flight into ORD was delayed bec...
Name: text, dtype: object

In [6]:
texts = training_data['text']
Y_train = training_data['airline_sentiment']

In [7]:
texts

0        @SouthwestAir I am scheduled for the morning, ...
1        @SouthwestAir seeing your workers time in and ...
2        @united Flew ORD to Miami and back and  had gr...
3           @SouthwestAir @dultch97 that's horse radish 😤🐴
4        @united so our flight into ORD was delayed bec...
                               ...                        
10975                              @AmericanAir followback
10976    @united thanks for the help. Wish the phone re...
10977    @usairways the. Worst. Ever. #dca #customerser...
10978    @nrhodes85: look! Another apology. DO NOT FLY ...
10979    @united you are by far the worst airline. 4 pl...
Name: text, Length: 10980, dtype: object

In [8]:
Y_train

0        negative
1        positive
2        positive
3        negative
4        negative
           ...   
10975     neutral
10976    positive
10977    negative
10978    negative
10979    negative
Name: airline_sentiment, Length: 10980, dtype: object

In [9]:
X =[]
X_test = []
for text in texts:
    X.append(word_tokenize(text))

for text in test_text:
    X_test.append(word_tokenize(text))

In [10]:
X[0]

['@',
 'SouthwestAir',
 'I',
 'am',
 'scheduled',
 'for',
 'the',
 'morning',
 ',',
 '2',
 'days',
 'after',
 'the',
 'fact',
 ',',
 'yes',
 '..',
 'not',
 'sure',
 'why',
 'my',
 'evening',
 'flight',
 'was',
 'the',
 'only',
 'one',
 'Cancelled',
 'Flightled']

In [20]:
stopwords = stopwords.words('english') + list(string.punctuation)
lemmatizer = WordNetLemmatizer()

In [22]:
def clean_doc(doc):
    clean_words=[]
    for word in doc:
        if word.lower() not in stopwords and ( word.isalpha()):
            clean_words.append( lemmatizer.lemmatize( word , simple(pos_tag([word])[0][1])).lower())
    return clean_words

In [23]:
def simple(tag):
    if tag.startswith('N'):
        return wordnet.NOUN
    if tag.startswith('V'):
        return wordnet.VERB
    if tag.startswith('A'):
        return wordnet.ADV
    if tag.startswith('J'):
        return wordnet.ADJ
    else:
        return wordnet.NOUN

In [24]:
X_train= [ ' '.join( clean_doc(doc)) for doc in X ]
X_test= [ ' '.join( clean_doc(doc)) for doc in X_test ]

In [25]:
X_test[0]

'southwestair schedule morning day fact yes sure even flight one cancelled flightled'

In [26]:
Y_train = Y_train.replace({
    'negative' : 0,
    'neutral' : 1,
    'positive' : 2,
})
Y_train

0        0
1        2
2        2
3        0
4        0
        ..
10975    1
10976    2
10977    0
10978    0
10979    0
Name: airline_sentiment, Length: 10980, dtype: int64

In [27]:
cv = CountVectorizer(max_features=3000)
x_train_features=cv.fit_transform(X_train)

In [28]:
x_train_features.todense()

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [29]:
from sklearn.svm import SVC
svc=SVC()

In [30]:
cross_val_score(svc,x_train_features,Y_train).mean()

0.7761384335154827

In [31]:
from sklearn.naive_bayes import MultinomialNB
model2 = MultinomialNB()
cross_val_score(model2,x_train_features,Y_train).mean()

0.7645719489981786

In [32]:
from sklearn.linear_model import LogisticRegression
model3 = LogisticRegression(max_iter=2000,multi_class='ovr')
cross_val_score(model3,x_train_features,Y_train).mean()

0.785792349726776

In [33]:
from sklearn.ensemble import RandomForestClassifier
model4 = RandomForestClassifier()
cross_val_score(model4,x_train_features,Y_train).mean()

0.7548269581056466

In [34]:
model3.fit(x_train_features,Y_train)

In [35]:
X_test_transform=cv.transform(X_test)

In [36]:
X_test_transform

<10980x3000 sparse matrix of type '<class 'numpy.int64'>'
	with 94857 stored elements in Compressed Sparse Row format>

In [37]:
prediction=model3.predict(X_test_transform)

In [57]:
prediction = pd.Series(prediction).replace({
    0:'negative',
    1:'neutral',
    2:'positive'
})
prediction

0        negative
1        positive
2        positive
3         neutral
4        negative
           ...   
10975     neutral
10976    positive
10977    negative
10978    negative
10979    negative
Length: 10980, dtype: object