In [1]:
# For Handling Data
import pandas as pd

# increase the output column width
pd.set_option('display.max_colwidth', 200)

# For numerical computing
import numpy as np

# Library for pattern matching
import re

# for NLP related tasks
import spacy
nlp=spacy.load('en_core_web_sm',disable=["tagger", "parser","ner"])

In [2]:
# mounting the drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# read CSV file
df = pd.read_csv('/content/drive/My Drive/Tweets.csv')

#shape of the dataframe
print('Shape=>',df.shape)

# print first 5 rows
df.head()

Shape=> (14640, 15)


Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials to the experience... tacky.,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I need to take another trip!,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,"@VirginAmerica it's really aggressive to blast obnoxious ""entertainment"" in your guests' faces &amp; they have little recourse",,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing about it,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


In [4]:
# Some sample tweets
df['text'].sample(5)

6159                                                                                                  @SouthwestAir will do! #heart #flying
8541                                                              @JetBlue to allow payment by #Apple Pay - Patriarc http://t.co/bdUaUZFhW2
13624                    @AmericanAir @emeyerson @ggreenwald Don't wait on a bag. Go to Walmart and get what you need for tomorrow morning.
4050     @united - I think she was having a rough moment w/ a bad passenger from an earlier flight. Things got considerably better. Thanks!
8172                    @JetBlue I'm sitting on the plane. Too many predictable mechanical problems.  Still onboard. This is getting bad...
Name: text, dtype: object

In [5]:
# class distribution
df['airline_sentiment'].value_counts()

airline_sentiment
negative    9178
neutral     3099
positive    2363
Name: count, dtype: int64

In [6]:
# class distribution in percentage
df['airline_sentiment'].value_counts(normalize = True)*100

airline_sentiment
negative    62.691257
neutral     21.168033
positive    16.140710
Name: proportion, dtype: float64

In [7]:
#define a function for text cleaning
def text_cleaner(text):

  #remove user mentions
  text = re.sub(r'@[A-Za-z0-9]+','',text)

  #remove hashtags
  #text = re.sub(r'#[A-Za-z0-9]+','',text)

  #remove links
  text = re.sub(r'http\S+', '', text)

  #convering text to lower case
  text = text.lower()

  # fetch only words
  text = re.sub("[^a-z]+", " ", text)

  # removing extra spaces
  text=re.sub("[\s]+"," ",text)

  # creating doc object
  doc=nlp(text)

  # remove stopwords and lemmatize the text
  tokens=[token.lemma_ for token in doc if(token.is_stop==False)]

  #join tokens by space
  return " ".join(tokens)

In [8]:
# perform text cleaning
df['clean_text']= df['text'].apply(text_cleaner)



In [9]:
# save cleaned text and labels to a variable
text   = df['clean_text'].values
labels = df['airline_sentiment'].values

In [10]:
# Sample cleaned text
text[:10]

array(['  said', '  plus ve added commercials experience tacky',
       '  didn t today mean need trip',
       '  s aggressive blast obnoxious entertainment guests faces amp little recourse',
       '  s big bad thing',
       '  seriously pay flight seats didn t playing s bad thing flying va',
       '  yes nearly time fly vx ear worm won t away',
       '  missed prime opportunity men hats parody', '  didn t d',
       '  amazing arrived hour early good'], dtype=object)

In [None]:
# Sample labels
labels[:10]

In [11]:
#importing label encoder
from sklearn.preprocessing import LabelEncoder

#define label encoder
le = LabelEncoder()

#fit and transform target strings to a numbers
labels = le.fit_transform(labels)

In [12]:
# Sample labels
labels[:10]

array([1, 2, 1, 0, 0, 0, 2, 1, 2, 2])

In [13]:
# Meaning of each label
le.inverse_transform([0,1,2])

array(['negative', 'neutral', 'positive'], dtype=object)

In [14]:
from sklearn.model_selection import train_test_split

# Splitting into train and validation set
x_train,x_val,y_train,y_val=train_test_split(text, labels,stratify=labels, test_size=0.2, random_state=0,shuffle=True)

In [16]:
print('x_train:',x_train.shape,'y_train:',y_train.shape)
print('x_val:',x_val.shape,'y_val:',y_val.shape)

x_train: (11712,) y_train: (11712,)
x_val: (2928,) y_val: (2928,)


In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [17]:
# initialize TFIDF
word_vectorizer = TfidfVectorizer(max_features=1000)

In [19]:
# Fitting Vectorizer on Train set
word_vectorizer.fit(x_train)

In [20]:
# create TF-IDF vectors for Train Set
train_word_features = word_vectorizer.transform(x_train)
train_word_features

<11712x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 65703 stored elements in Compressed Sparse Row format>

In [21]:
# Importing for modeling
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score

In [22]:
# Training model
nb_model=MultinomialNB().fit(train_word_features,y_train)
nb_model

In [23]:
# Make predictions for train set
train_pred_nb=nb_model.predict(train_word_features)

In [25]:
train_pred_nb

array([0, 0, 0, ..., 2, 0, 0])

In [27]:
# Evaluating on Training Set
print("F1-score on Train Set:",f1_score(y_train,train_pred_nb,average="weighted"))

F1-score on Train Set: 0.7303326366733179


In [34]:
from sklearn.linear_model import LogisticRegression

In [35]:
# Training model
lr_model=LogisticRegression().fit(train_word_features,y_train)
lr_model

In [36]:
# Make predictions for train set
train_pred_lr=lr_model.predict(train_word_features)
train_pred_nb

array([0, 0, 0, ..., 2, 0, 0])

In [37]:
# Evaluating on Training Set
print("F1-score on Train Set:",f1_score(y_train,train_pred_lr,average="weighted"))

F1-score on Train Set: 0.8074180766755015


MODEL SUMMARY LOGISTIC REGRESSION PERFORMS BETTER ON THIS DATASET


In [38]:
def sentiment_analyzer(tweet):
  # Cleaning Tweet
  cleaned_tweet=text_cleaner(tweet)

  # Feature Engineering
  tweet_vector=word_vectorizer.transform([cleaned_tweet])

  # Predicting Sentiment
  label=lr_model.predict(tweet_vector)

  return le.inverse_transform(np.array(label))

In [39]:
sentiment_analyzer("@USAirways flt 419. 2+ hrs Late Flight, baggage + 1 more hr. Now I see they delivered my suitcase wet inside &amp; out. #NotHappy")



array(['negative'], dtype=object)