# Assignment 4: Twitter Airline Sentiment Analysis
 <ul> 
    <li> Dataset: <a href= 'https://www.kaggle.com/crowdflower/twitter-airline-sentiment/version/2'> Twitter US Airline Sentiment  </a> </li>
    <li> Contains positive, neutral, and negative tweets directed at six US airlines </li>
 </ul>

In [202]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import preprocessor as p
import pandas as pd
import numpy as np
p.set_options(p.OPT.EMOJI, p.OPT.URL, p.OPT.MENTION)

In [313]:
file_location = "../../data/Tweets.csv"
columns = ['airline_sentiment', 'airline', 'text']
tweets_df = pd.read_csv(file_location, usecols = columns)

In [245]:
class TweetPreprocessor:
    def __init__(self, df):
        self.df = df
    
    def fit(self, tweet_col, target_col):
        np.random.shuffle(self.df.values)
        self.drop_nulls()
        self.lower_case(column_name = tweet_col)
        self.preprocess_tweets(tweet_col)
        self.label_encode(target_col)
        
        count_vect = CountVectorizer()
        counts = count_vect.fit_transform(self.df[tweet_col])
        transformer = TfidfTransformer().fit(counts)
        normalized_counts = transformer.transform(counts)  
        
        return normalized_counts, self.df,  
    
    def label_encode(self, target_col):
        le = preprocessing.LabelEncoder()
        self.df[target_col] = le.fit_transform(self.df[target_col])
    
    def preprocess_tweets(self, tweet_col):
        self.df[tweet_col] = self.df[tweet_col].apply(p.clean)
        
    def token_counts(self, tweet_col):
        count_vect = CountVectorizer()
        counts = count_vect.fit_transform(self.df[tweet_col])
        return counts
    
    def normalize_counts(self, counts):
        transformer = TfidfTransformer().fit(counts)
        return transformer.transform(counts)
    
    def drop_nulls(self):
        if(self.df.isna().sum().sum()>0):
            self.df.dropna(inplace=True)
        
    def lower_case(self, column_name):
        self.df[column_name] = self.df[column_name].str.lower()

In [381]:
target_col = 'airline_sentiment'
preprocessor = TweetPreprocessor(tweets_df)
normalized_counts, df = preprocessor.fit(tweet_col = 'text', target_col = target_col)
print(df.shape)
X_train, X_test, y_train, y_test = train_test_split(normalized_counts, df[target_col], test_size=0.1)

(14640, 3)


In [377]:
positive_sentiment = {}

for airline in np.unique(df['airline']):
    airline_tweets = df[df['airline'] == airline]
    positive_tweet_count = len(airline_tweets[airline_tweets['airline_sentiment'] == 2])
    positive_sentiment[airline] = np.round(positive_tweet_count/len(airline_tweets),2)

positive_airline = [key for key in positive_sentiment.keys() if 
                    positive_sentiment[key] == max(positive_sentiment.values())][0]

In [380]:
df.groupby('airline').mean()

Unnamed: 0_level_0,airline_sentiment
airline,Unnamed: 1_level_1
American,0.411381
Delta,0.815032
Southwest,0.745455
US Airways,0.315482
United,0.439822
Virgin America,0.94246


In [378]:
positive_airline

'Virgin America'

In [340]:
alpha_values = list(np.round(np.random.random_sample(5),2))
fit_prior_value = False
accuracies = []

for alpha_val in alpha_values:
    model = MultinomialNB(alpha=alpha_val, fit_prior = fit_prior_value).fit(X_train, y_train)
    predicted = model.predict(X_test)
    accuracy = np.round(np.mean(predicted == y_test),2)
    accuracies.append(accuracy)

In [341]:
results = {'alpha':alpha_values, 'fit_prior': len(accuracies)*[False], 'accuracy': accuracies}
results_df = pd.DataFrame(data = results)

In [342]:
results_df

Unnamed: 0,alpha,fit_prior,accuracy
0,0.28,False,0.76
1,0.41,False,0.76
2,0.71,False,0.77
3,0.28,False,0.76
4,0.08,False,0.75


In [347]:
df['airline_sentiment'].value_counts()

0    9178
1    3099
2    2363
Name: airline_sentiment, dtype: int64