# Assignment 4: Twitter Airline Sentiment Analysis
 <ul> 
    <li> Dataset: <a href= 'https://www.kaggle.com/crowdflower/twitter-airline-sentiment/version/2'> Twitter US Airline Sentiment  </a> </li>
    <li> Contains positive, neutral, and negative tweets directed at six US airlines </li>
 </ul>

In [29]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import preprocessor as p
import pandas as pd
import numpy as np
p.set_options(p.OPT.EMOJI, p.OPT.URL, p.OPT.MENTION)

In [30]:
file_location = "../../data/Tweets.csv"
columns = ['airline_sentiment', 'airline', 'text']
tweets_df = pd.read_csv(file_location, usecols = columns)

In [31]:
class TweetPreprocessor:
    def __init__(self, df):
        self.df = df
    
    def fit(self, tweet_col, target_col):
        np.random.shuffle(self.df.values)
        self.drop_nulls()
        self.lower_case(column_name = tweet_col)
        self.preprocess_tweets(tweet_col)
        self.label_encode(target_col)
        
        count_vect = CountVectorizer()
        counts = count_vect.fit_transform(self.df[tweet_col])
        transformer = TfidfTransformer().fit(counts)
        normalized_counts = transformer.transform(counts)  
        
        return normalized_counts, self.df,  
    
    def label_encode(self, target_col):
        le = LabelEncoder()
        self.df[target_col] = le.fit_transform(self.df[target_col])
    
    def preprocess_tweets(self, tweet_col):
        self.df[tweet_col] = self.df[tweet_col].apply(p.clean)
        
    def token_counts(self, tweet_col):
        count_vect = CountVectorizer()
        counts = count_vect.fit_transform(self.df[tweet_col])
        return counts
    
    def normalize_counts(self, counts):
        transformer = TfidfTransformer().fit(counts)
        return transformer.transform(counts)
    
    def drop_nulls(self):
        if(self.df.isna().sum().sum()>0):
            self.df.dropna(inplace=True)
        
    def lower_case(self, column_name):
        self.df[column_name] = self.df[column_name].str.lower()

In [32]:
target_col = 'airline_sentiment'
preprocessor = TweetPreprocessor(tweets_df)
normalized_counts, df = preprocessor.fit(tweet_col = 'text', target_col = target_col)
print(df.shape)
X_train, X_test, y_train, y_test = train_test_split(normalized_counts, df[target_col], test_size=0.1)

(14640, 3)


In [33]:
positive_sentiment = {}

for airline in np.unique(df['airline']):
    airline_tweets = df[df['airline'] == airline]
    positive_tweet_count = len(airline_tweets[airline_tweets['airline_sentiment'] == 2])
    positive_sentiment[airline] = np.round(positive_tweet_count/len(airline_tweets),2)

positive_airline = [key for key in positive_sentiment.keys() if 
                    positive_sentiment[key] == max(positive_sentiment.values())][0]

In [34]:
df.groupby('airline').mean()

Unnamed: 0_level_0,airline_sentiment
airline,Unnamed: 1_level_1
American,0.411381
Delta,0.815032
Southwest,0.745455
US Airways,0.315482
United,0.439822
Virgin America,0.94246


In [35]:
positive_airline

'Virgin America'

In [36]:
alpha_values = list(np.round(np.random.random_sample(5),2))
fit_prior_value = False
accuracies = []

for alpha_val in alpha_values:
    model = MultinomialNB(alpha=alpha_val, fit_prior = fit_prior_value).fit(X_train, y_train)
    predicted = model.predict(X_test)
    accuracy = np.round(np.mean(predicted == y_test),2)
    accuracies.append(accuracy)

In [37]:
results = {'alpha':alpha_values, 'fit_prior': len(accuracies)*[False], 'accuracy': accuracies}
results_df = pd.DataFrame(data = results)

In [57]:
max_accuracy = results_df['accuracy'].max()
max_row = results_df[results_df['accuracy'] == max_accuracy]
optimal_alpha = list(max_row['alpha'])[0]

0.65

In [43]:
results_df

Unnamed: 0,alpha,fit_prior,accuracy
0,0.05,False,0.75
1,0.65,False,0.78
2,0.81,False,0.77
3,0.59,False,0.78
4,0.68,False,0.78


In [39]:
df['airline_sentiment'].value_counts()

0    9178
1    3099
2    2363
Name: airline_sentiment, dtype: int64

In [40]:
df

Unnamed: 0,airline_sentiment,airline,text
0,0,United,been on hold on the phone for well over half a...
1,0,American,weather was not involved.
2,1,Delta,"hello good afternoon how are you, i need know ..."
3,0,Delta,continuing you record of never having a flight...
4,0,US Airways,uh yeah. flight boarded &amp; now 1 hr late fl...
...,...,...,...
14635,2,Southwest,de-icing is important!
14636,2,United,flew from sdf to atl to tampa on delta. left e...
14637,0,Southwest,we understand air delays which are out of your...
14638,0,American,im still on hold...
