In [1]:
import pandas as pd
import re
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import nltk
# from nltk.stem import PorterStemmer # for stemming
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import collections
# from sklearn.feature_extraction.text import CountVectorizer # for bag of words vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score  
import warnings
import joblib

In [2]:
# importing file
file = pd.read_csv('Accuracy_tweets_data.csv')

df = pd.DataFrame(file)
# df = df.head(5000)

df['text'] = df['text'].astype(str)  
warnings.filterwarnings('ignore')

In [3]:
df1 = df[['text','airline_sentiment']]
df1

Unnamed: 0,text,airline_sentiment
0,@VirginAmerica What @dhepburn said.,neutral
1,@VirginAmerica plus you've added commercials t...,positive
2,@VirginAmerica I didn't today... Must mean I n...,neutral
3,@VirginAmerica it's really aggressive to blast...,negative
4,@VirginAmerica and it's a really big bad thing...,negative
...,...,...
14635,@AmericanAir thank you we got on a different f...,positive
14636,@AmericanAir leaving over 20 minutes Late Flig...,negative
14637,@AmericanAir Please bring American Airlines to...,neutral
14638,"@AmericanAir you have my money, you change my ...",negative


In [4]:
# cleaning text
def filter_txt(tweets):
    tweets = tweets.lower()
    tweets = re.sub(r'@[A-Za-z0-9]+:?', '', tweets)  # removing @mentions
    tweets = re.sub(r'#','',tweets) # removing #
    tweets = re.sub(r'https?://(t|www).([a-zA-Z0-9/]+)','',tweets) # removing https links
    tweets = re.sub(r'^(,|.|-)','',tweets) 
    tweets = re.sub(r'\s+',' ',tweets.strip()) # trim if more then one whitespace
    
    tweets = tweets.replace("won't",'will not')
    tweets = tweets.replace("didn't",'did not')
    tweets = tweets.replace("can't",'can not')
    tweets = tweets.replace("don't",'do not')
    tweets = tweets.replace("isn't",'is not')
    tweets = tweets.replace("hasn't",'has not')
    tweets = tweets.replace("haven't",'have not')

    tweets = tweets.replace("you're",'you are')
    tweets = tweets.replace("they're",'they are')
    tweets = tweets.replace("we're",'we are')
    
    tweets = tweets.replace("we'll",'we will')
    tweets = tweets.replace("you'll",'you will')
    tweets = tweets.replace("he'll",'he will')
    tweets = tweets.replace("she'll",'she will')
    tweets = tweets.replace("they'll",'they will')

    tweets = tweets.replace("i'm",'i am')
    
    tweets = tweets.replace("it's",'it is')
    tweets = tweets.replace("he's",'he is')
    tweets = tweets.replace("she's",'she is')
    tweets = tweets.replace("that's",'that is')


    tweets = tweets.replace("you've",'you have')
    tweets = tweets.replace("i've",'i have')
    tweets = tweets.replace("they've",'they have')
    
    tweets = re.sub(r'[^\w]', ' ', tweets) # remove not alphanumeric
    tweets = tweets.lstrip()


    return tweets

In [5]:
df1['text'] = df1['text'].apply(filter_txt)
pd.set_option('display.max_colwidth', None)
df1

Unnamed: 0,text,airline_sentiment
0,what said,neutral
1,plus you have added commercials to the experience tacky,positive
2,i did not today must mean i need to take another trip,neutral
3,it is really aggressive to blast obnoxious entertainment in your guests faces amp they have little recourse,negative
4,and it is a really big bad thing about it,negative
...,...,...
14635,thank you we got on a different flight to chicago,positive
14636,leaving over 20 minutes late flight no warnings or communication until we were 15 minutes late flight that is called shitty customer svc,negative
14637,please bring american airlines to blackberry10,neutral
14638,you have my money you change my flight and do not answer your phones any other suggestions so i can make my commitment,negative


In [6]:
# tokenization
df1['tokens'] = df1['text'].apply(lambda x: nltk.word_tokenize(x))

#lemmatization
lt = WordNetLemmatizer()
df1['lemmed'] = df1['tokens'].apply(lambda x: [lt.lemmatize(y) for y in x if y not in stopwords.words('english')])
df1['lemmed'] = df1['lemmed'].apply(lambda x: ' '.join(map(str, x))) # joining words to make a sentence
df1

# Stemming 
# st = PorterStemmer()
# df1['stemmed'] = df1['tokens'].apply(lambda x: [st.stem(y) for y in x if y not in stopwords.words('english')])
# df1['stemmed'] = df1['stemmed'].apply(lambda x: ' '.join(map(str, x))) # joining words to make a sentence
# df1

Unnamed: 0,text,airline_sentiment,tokens,lemmed
0,what said,neutral,"[what, said]",said
1,plus you have added commercials to the experience tacky,positive,"[plus, you, have, added, commercials, to, the, experience, tacky]",plus added commercial experience tacky
2,i did not today must mean i need to take another trip,neutral,"[i, did, not, today, must, mean, i, need, to, take, another, trip]",today must mean need take another trip
3,it is really aggressive to blast obnoxious entertainment in your guests faces amp they have little recourse,negative,"[it, is, really, aggressive, to, blast, obnoxious, entertainment, in, your, guests, faces, amp, they, have, little, recourse]",really aggressive blast obnoxious entertainment guest face amp little recourse
4,and it is a really big bad thing about it,negative,"[and, it, is, a, really, big, bad, thing, about, it]",really big bad thing
...,...,...,...,...
14635,thank you we got on a different flight to chicago,positive,"[thank, you, we, got, on, a, different, flight, to, chicago]",thank got different flight chicago
14636,leaving over 20 minutes late flight no warnings or communication until we were 15 minutes late flight that is called shitty customer svc,negative,"[leaving, over, 20, minutes, late, flight, no, warnings, or, communication, until, we, were, 15, minutes, late, flight, that, is, called, shitty, customer, svc]",leaving 20 minute late flight warning communication 15 minute late flight called shitty customer svc
14637,please bring american airlines to blackberry10,neutral,"[please, bring, american, airlines, to, blackberry10]",please bring american airline blackberry10
14638,you have my money you change my flight and do not answer your phones any other suggestions so i can make my commitment,negative,"[you, have, my, money, you, change, my, flight, and, do, not, answer, your, phones, any, other, suggestions, so, i, can, make, my, commitment]",money change flight answer phone suggestion make commitment


In [7]:
# word and frequency 
df1['lemmed'] = df1['lemmed'].astype(str)

bagsofwords = [collections.Counter(re.findall(r'\w+', txt)) for txt in df1['lemmed']]
sumbags = sum(bagsofwords, collections.Counter()) # to view total count of particular word
sumbags

Counter({'said': 180,
         'plus': 58,
         'added': 20,
         'commercial': 17,
         'experience': 215,
         'tacky': 1,
         'today': 433,
         'must': 36,
         'mean': 92,
         'need': 632,
         'take': 299,
         'another': 273,
         'trip': 231,
         'really': 300,
         'aggressive': 3,
         'blast': 2,
         'obnoxious': 2,
         'entertainment': 25,
         'guest': 8,
         'face': 14,
         'amp': 683,
         'little': 68,
         'recourse': 5,
         'big': 72,
         'bad': 186,
         'thing': 133,
         'seriously': 79,
         'would': 561,
         'pay': 139,
         '30': 182,
         'flight': 4584,
         'seat': 483,
         'playing': 8,
         'flying': 315,
         'va': 10,
         'yes': 238,
         'nearly': 14,
         'every': 117,
         'time': 968,
         'fly': 387,
         'vx': 7,
         'ear': 2,
         'worm': 1,
         'go': 342,
         'awa

In [8]:
# # Converting the text into vectors using BAG OF WORDS
# cv = CountVectorizer()
# X = cv.fit_transform(df1['lemmed']).toarray()

# Creating Dataframe of the vectors using BAG OF WORDS
# cv_dataframe=pd.DataFrame(X,columns=cv.get_feature_names())
# pd.set_option('display.max_columns', None)
# print(cv_dataframe)

In [9]:
# Converting the text into vectors using TF-IDF
cv = TfidfVectorizer()
X = cv.fit_transform(df1['lemmed']).toarray()

# Creating Dataframe of the vectors using TF-IDF
cv_dataframe=pd.DataFrame(X,columns=cv.get_feature_names())
pd.set_option('display.max_columns', None)
print(cv_dataframe)

        00  000  000114  000419  000ft  000lbs  0011  0016  00a  00am  00p  \
0      0.0  0.0     0.0     0.0    0.0     0.0   0.0   0.0  0.0   0.0  0.0   
1      0.0  0.0     0.0     0.0    0.0     0.0   0.0   0.0  0.0   0.0  0.0   
2      0.0  0.0     0.0     0.0    0.0     0.0   0.0   0.0  0.0   0.0  0.0   
3      0.0  0.0     0.0     0.0    0.0     0.0   0.0   0.0  0.0   0.0  0.0   
4      0.0  0.0     0.0     0.0    0.0     0.0   0.0   0.0  0.0   0.0  0.0   
...    ...  ...     ...     ...    ...     ...   ...   ...  ...   ...  ...   
14635  0.0  0.0     0.0     0.0    0.0     0.0   0.0   0.0  0.0   0.0  0.0   
14636  0.0  0.0     0.0     0.0    0.0     0.0   0.0   0.0  0.0   0.0  0.0   
14637  0.0  0.0     0.0     0.0    0.0     0.0   0.0   0.0  0.0   0.0  0.0   
14638  0.0  0.0     0.0     0.0    0.0     0.0   0.0   0.0  0.0   0.0  0.0   
14639  0.0  0.0     0.0     0.0    0.0     0.0   0.0   0.0  0.0   0.0  0.0   

       00pm   01  0162389030167  0162424965446  0162431184663  

In [10]:
X.shape

(14640, 12177)

In [11]:
Y = cv.fit_transform(df1['airline_sentiment']).toarray()
Y

array([[0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       ...,
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 1., 0.]])

In [12]:
Y = cv.fit_transform(df1['airline_sentiment']).toarray()
Y

array([[0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       ...,
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 1., 0.]])

In [27]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size= 0.10) 
model = DecisionTreeClassifier()
model.fit(X_train, Y_train) 
prediction = model.predict(X_test)

In [28]:
score = accuracy_score(Y_test,prediction)
score

0.6905737704918032

In [29]:
Y_test

array([[1., 0., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       ...,
       [0., 1., 0.],
       [1., 0., 0.],
       [1., 0., 0.]])

In [30]:
prediction

array([[1., 0., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       ...,
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.]])

In [31]:
# joblib.dump(model,'ml.joblib')