In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from collections import Counter
from wordcloud import WordCloud
import datetime as dt
import seaborn as sb
%matplotlib inline

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

from subprocess import check_output
print(check_output(["ls", "../input"]).decode("utf8"))


def parser(x):
    """Parse one `tweet_created` string into a naive datetime.

    The last six characters of the raw value are cut off before parsing,
    so the result carries no timezone information.
    """
    return dt.datetime.strptime(x[:-6], '%Y-%m-%d %H:%M:%S')


# Load first, then convert the timestamp column explicitly: the `date_parser`
# argument of read_csv is deprecated in recent pandas releases, while this
# two-step form works on old and new versions alike.
data = pd.read_csv("../input/Tweets.csv")
created_col = data.columns[12]  # same positional column the loader used to parse
data[created_col] = data[created_col].map(parser)
data.head()

In [None]:
# Check the ratio of positive and negative tweets for each airline.
# Every row gets a helper column of ones, so summing it per
# (airline, sentiment) pair yields the number of tweets in that pair.
data['countval'] = 1
groupby_object = (
    data[['airline', 'airline_sentiment', 'countval']]
    .groupby(['airline', 'airline_sentiment'])
    # .sum() instead of .aggregate(sum): handing the Python builtin to
    # aggregate emits a FutureWarning on recent pandas versions.
    .sum()
)
sentiment_ax = groupby_object.unstack(level=1).plot(kind='bar', figsize=(12, 8))
sentiment_ax.set_title('Tweet sentiment by airline')
sentiment_ax.set_ylabel('Number of tweets')
plt.show()

In [None]:
# Day of week of each tweet, taken from the parsed timestamp column.
data['dow'] = data.tweet_created.dt.dayofweek

# One row of axes per sentiment, one density curve per airline.
# `height` is the current name of the former `size` argument, and kdeplot
# draws the same curve the deprecated distplot(hist=False) produced.
g = sb.FacetGrid(data, row='airline_sentiment',
                 hue='airline', legend_out=True,
                 aspect=4, height=2.5)
g.map(sb.kdeplot, 'dow')
g.add_legend()
g.axes.flat[0].set_xlim(0, 6)
# Index 2 is the third facet, so this relies on there being three sentiment rows.
g.axes.flat[2].set_xlabel('Day of Week')

In [None]:
# Number of tweets per negative reason, split by airline and drawn as a
# stacked bar chart (one bar per reason, one colour per airline).
pf = data.groupby(['negativereason']).airline.value_counts()
my_plot = pf.unstack().plot(kind='bar', stacked=True, figsize=(12, 16),
                            rot=90,  # vertical tick labels, set directly on the plot call
                            title="Negative Reasons by Airlines")
my_plot.set_xlabel("Negative Reason")
# The bar height is a tweet count; the airlines are the stacked colours.
my_plot.set_ylabel("Number of Tweets")

In [None]:
# Word cloud for POSITIVE tweets
df = data.loc[data['airline_sentiment'] == 'positive']
# Concatenate every positive tweet into one space-separated string
words = ' '.join(df['text'])
# Keep only tokens that are not URLs, twitter handles or the retweet marker
cleaned_words = " ".join(
    token for token in words.split()
    if not ('http' in token or token.startswith('@') or token == 'RT')
)
cloud_builder = WordCloud(background_color='black', width=2000, height=1500)
wordcloud = cloud_builder.generate(cleaned_words)
plt.figure(1, figsize=(12, 12))
plt.imshow(wordcloud)
plt.axis('off')
plt.show()

In [None]:
# Word cloud for NEGATIVE tweets
df = data.loc[data['airline_sentiment'] == 'negative']
# Concatenate every negative tweet into one space-separated string
words = ' '.join(df['text'])
# Keep only tokens that are not URLs, twitter handles or the retweet marker
cleaned_words = " ".join(
    token for token in words.split()
    if not ('http' in token or token.startswith('@') or token == 'RT')
)
cloud_builder = WordCloud(background_color='black', width=2000, height=1500)
wordcloud = cloud_builder.generate(cleaned_words)
plt.figure(1, figsize=(12, 12))
plt.imshow(wordcloud)
plt.axis('off')
plt.show()

In [None]:
# Word cloud for NEUTRAL tweets
df = data.loc[data['airline_sentiment'] == 'neutral']
# Concatenate every neutral tweet into one space-separated string
words = ' '.join(df['text'])
# Keep only tokens that are not URLs, twitter handles or the retweet marker
cleaned_words = " ".join(
    token for token in words.split()
    if not ('http' in token or token.startswith('@') or token == 'RT')
)
cloud_builder = WordCloud(background_color='black', width=2000, height=1500)
wordcloud = cloud_builder.generate(cleaned_words)
plt.figure(1, figsize=(12, 12))
plt.imshow(wordcloud)
plt.axis('off')
plt.show()

In [None]:
def find_emojies ( raw_review, emoji ):
    """Tally the characters of `raw_review` whose escape starts with \\u or \\U.

    Each character is encoded with `unicode_escape`; those whose escaped
    form begins with `\\u` or `\\U` are counted, except the variation
    selector `\\ufe0f`. This catches emoji, but also any other character
    that Python escapes in that form.

    Parameters:
        raw_review: the tweet text to scan, one character at a time.
        emoji: dict updated in place; keys are "<character> <escape>"
            and values are occurrence counts.

    Returns:
        None. The result is the mutation of `emoji`.
    """
    for char in raw_review:
        escaped = str(char.encode('unicode_escape'), 'utf-8')
        if not escaped.lower().startswith('\\u'):
            continue
        if escaped == '\\ufe0f':
            continue
        # Decoding the escape gives back the character itself, so use it directly.
        key = char + " " + escaped
        emoji[key] = emoji.get(key, 0) + 1

# Collect the escaped-character counts over every tweet, then list them.
emoji = {}
for tweet in data['text']:
    find_emojies(tweet, emoji)

for key, count in emoji.items():
    print(key, count)

In [None]:
import re
          
# Ordered (tokens, word) pairs consumed by replace_emoji. Emoji are written
# as the text that str.encode('unicode_escape') produces for them, so they
# can be matched inside an escaped tweet; plain-text emoticons are literal.
# Built once at definition time instead of on every call.
_EMOJI_WORDS = (
    (('\\U0001f44d',), 'like'),
    (('\\U0001f44c',), 'great'),
    (('\\U0001f60e',), 'cool'),
    (('\\U0001f44f',), 'applause'),
    (('\\U0001f44e',), 'dislike'),
    (('\\U0001f618',), 'kiss'),
    (('\\U0001f625',), 'disappointed'),
    (('\\U0001f389', '\\U0001f64c'), 'celebrate'),
    (('\\U0001f494',), 'heartbreak'),
    (('\\u263a', '\\U0001f60a', '\\U0001f603', '\\U0001f601', '\\U0001f602', '\\U0001f600',
      '\\U0001f604', '\\U0001f609', '\\U0001f61c', '\\U0001f624', '\\U0001f60b', '\\U0001f60f',
      '\\U0001f605', '\\U0001f3b5', '\\U0001f606',
      ':)', ':-)', '=)', ';)', ';-)', ':D', ':-D', ';P', ':P', ';D'), 'happy'),
    (('\\U0001f622', '\\U0001f62d', '\\U0001f61e', '\\U0001f614', '\\U0001f615',
      '\\U0001f623', '\\U0001f613', ':(', ':|', ':-('), 'sad'),
    (('\\u2764', '\\U0001f60d', '\\U0001f49c', '\\U0001f495', '\\U0001f497', '\\u2665',
      '\\U0001f496', '\\U0001f498', '\\U0001f49d', '\\U0001f499'), 'heart'),
    (('\\U0001f621', '\\U0001f612', '\\U0001f62c', '\\U0001f611', '\\U0001f620',
      '\\U0001f610', '\\U0001f626'), 'angry'),
    (('\\U0001f631', '\\U0001f62e', '\\U0001f632'), 'surprised'),
    (('\\U0001f629', '\\U0001f62b', '\\U0001f62a'), 'tired'),
)


def replace_emoji (raw_review):
    """Replace known emoji and text emoticons in a tweet with plain words.

    The tweet is escaped with `unicode_escape`, each token listed in
    `_EMOJI_WORDS` is substituted by its word, and the result is decoded
    back to regular text.

    The substituted word is padded with a space on BOTH sides. With only a
    trailing space, an emoji glued to the preceding text fused with it
    ("great<thumbs-up>" became "greatlike"), and an emoji right after an
    @handle was later swallowed by the handle-removal regex.

    Parameters:
        raw_review: the raw tweet text (str).

    Returns:
        The tweet with every listed emoji/emoticon replaced by " <word> ".
        Characters that are not listed are returned unchanged.
    """
    encoded_review = str(raw_review.encode('unicode_escape'), 'utf-8')

    for tokens, word in _EMOJI_WORDS:
        padded_word = ' ' + word + ' '
        for token in tokens:
            encoded_review = encoded_review.replace(token, padded_word)

    return bytes(encoded_review, 'utf-8').decode('unicode_escape')
    
def tweet_to_words( raw_review, lemmatizer=None ):
    """Convert a raw tweet into a string of lower-case, lemmatized words.

    Steps:
      1. Remove twitter handles ("@" followed by letters, digits or
         underscores), then replace every remaining non-letter by a space.
      2. Lower-case the text and split it on whitespace.
      3. Lemmatize each word.
      4. Join the words back with single spaces.

    NOTE(review): stop words are NOT removed here, although an earlier
    comment said so — confirm whether that filtering was intended.

    Parameters:
        raw_review: the raw tweet text (str).
        lemmatizer: optional object exposing `lemmatize(word)`. When
            omitted, the notebook-level `wordnet_lemmatizer` is used, which
            must be defined before the first call.

    Returns:
        The preprocessed tweet as a single space-separated string (empty
        when no letters remain).
    """
    if lemmatizer is None:
        lemmatizer = wordnet_lemmatizer

    # 1. Remove twitter handles and non-letters. The handle class includes
    #    "0" and "_": the earlier "[a-zA-Z1-9]" class stopped at those
    #    characters and left the tail of the handle behind as a bogus word.
    no_handlers = re.sub(r"@[A-Za-z0-9_]+", " ", raw_review)
    letters_only = re.sub("[^a-zA-Z]", " ", no_handlers)

    # 2. Convert to lower case, split into individual words
    words = letters_only.lower().split()

    # 3. Lemmatize every word
    meaningful_words = [lemmatizer.lemmatize(w) for w in words]

    # 4. Join the words back into one string separated by space
    return " ".join(meaningful_words)

In [None]:
from nltk.corpus import stopwords
from nltk import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.stem import WordNetLemmatizer

# Words are normalised with the WordNet lemmatizer; the stemmers imported
# above are not used in this cell.
wordnet_lemmatizer = WordNetLemmatizer()

# English stop words, keeping a handful of negations out of the list.
negations = ['no', 'not','didn', 'won','couldn','haven']
stops = [word for word in set(stopwords.words("english")) if word not in negations]

# Emoji replacement first, then text clean-up, for every tweet.
# The `data["text"]` column itself is left unmodified.
processed_tweets = [tweet_to_words(replace_emoji(tweet)) for tweet in data['text']]

# Bag-of-words counts over the cleaned tweets.
vect = CountVectorizer()
processed_text = vect.fit_transform(processed_tweets)

In [None]:
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm

# Hold out 20% of the tweets for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    processed_text,
    data['airline_sentiment'],
    test_size=0.20,
    random_state=42,
)

In [None]:
# Train Multinomial Naive Bayes
model = MultinomialNB().fit(X_train, y_train)

# Predict the held-out tweets and keep the true labels next to them
expected_bayes, predicted_bayes = y_test, model.predict(X_test)

# Per-class report first, confusion matrix second
for summarize in (metrics.classification_report, metrics.confusion_matrix):
    print(summarize(expected_bayes, predicted_bayes))

In [None]:
# Train Random Forest Classifier.
# The forest is randomised, so the seed is fixed to make the reported
# numbers identical from one run of the notebook to the next.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# make predictions
expected_rfc = y_test
predicted_rfc = model.predict(X_test)

# summarize the fit of the model
print(metrics.classification_report(expected_rfc, predicted_rfc))
print(metrics.confusion_matrix(expected_rfc, predicted_rfc))

In [None]:
# Train a linear SVM with balanced class weights
model = svm.LinearSVC(
    C=1.0,
    random_state=0,
    class_weight='balanced',
).fit(X_train, y_train)

# Predict the held-out tweets and keep the true labels next to them
expected_svm, predicted_svm = y_test, model.predict(X_test)

# Per-class report first, confusion matrix second
svm_report = metrics.classification_report(expected_svm, predicted_svm)
svm_confusion = metrics.confusion_matrix(expected_svm, predicted_svm)
print(svm_report)
print(svm_confusion)

In [None]:
# Train SVM with Tfidf.
# Split the cleaned tweets BEFORE fitting the vectorizer: fitting TF-IDF on
# the whole corpus would let the test tweets influence the vocabulary and
# the IDF weights, which leaks information into the evaluation.
# Same test_size and seed as the count-based split.
tweets_train, tweets_test, y_train, y_test = train_test_split(
    processed_tweets, data['airline_sentiment'], test_size=0.20, random_state=42)

tf = TfidfVectorizer()
X_train = tf.fit_transform(tweets_train)  # vocabulary and IDF learned on train only
X_test = tf.transform(tweets_test)
# Keep `processed_text` bound to the TF-IDF view of the full corpus, as before.
processed_text = tf.transform(processed_tweets)

model = svm.LinearSVC(C=1.0, random_state=0, class_weight='balanced')
model.fit(X_train, y_train)

# make predictions
expected_tf = y_test
predicted_tf = model.predict(X_test)
# summarize the fit of the model
print(metrics.classification_report(expected_tf, predicted_tf))
print(metrics.confusion_matrix(expected_tf, predicted_tf))

In [None]:
# Predict tweet sentiment with dictionary approach 
from nltk.sentiment.vader import SentimentIntensityAnalyzer

def _label_from_compound(compound):
    """Map a VADER compound score to one of the three sentiment labels."""
    if compound < 0:
        return 'negative'
    if compound < 0.35:
        return 'neutral'
    return 'positive'


sid = SentimentIntensityAnalyzer()
# One polarity dict per preprocessed tweet
tweet_scores = [sid.polarity_scores(tweet) for tweet in processed_tweets]
# Only the compound score decides the predicted label
predicted_dict = [_label_from_compound(score['compound']) for score in tweet_scores]

# Scored against the labels of the whole dataset (no train/test split here)
true_labels = data['airline_sentiment']
print(metrics.classification_report(true_labels, predicted_dict))
print(metrics.confusion_matrix(true_labels, predicted_dict))

In [None]:
# Show the first misclassified test tweets of the TF-IDF model.
MAX_ERRORS_SHOWN = 30

# Collect every mismatch together with its position inside the test split.
errors = []
error_positions = []
for position, (tweet, e_sentiment, p_sentiment) in enumerate(
        zip(X_test, expected_tf.values, predicted_tf)):
    if e_sentiment != p_sentiment:
        errors.append((tweet, e_sentiment, p_sentiment))
        error_positions.append(position)

# `expected_tf` is a slice of data['airline_sentiment'], so its index still
# holds the row labels of `data`. Looking the raw tweet up by that label is
# exact and immediate; rebuilding the word list from the TF-IDF vector and
# searching the whole corpus for a tweet containing those words was slow
# and could print a different tweet that merely shared them.
for position, error in list(zip(error_positions, errors))[:MAX_ERRORS_SHOWN]:
    row_label = expected_tf.index[position]
    print(data['text'].loc[row_label])
    print("Expected: "+error[1]+'\nPredicted: '+error[2])
    print("---------------------------------------------------------------------------")