# Problem statement:
# Twitter US Airline Sentiment

##  Objective: Analyze how travelers in February 2015 expressed their feelings on Twitter


#### In current data set we have tweets for 6 US airlines and we need to predict whether the tweets are positive, negative or neutral

This is a typical supervised learning task: we are given labelled tweets and we need to classify them into pre-defined categories.

#### Below are the steps we'll be following :
     1. Data preparation
         a. analyze missing values
         b. remove redundant columns
     2. EDA:
         a. Analyze different moods
     3. Clean the tweet messages:
         a. Remove all the special characters
         b. remove all single characters
         c. Substituting multiple spaces with single space
         d. convert all letters to lower case
         e. stemming words
         f. filter out English stop words
     4. Test-Train split
     5. Use TF-IDF technique to create features from text
     6. Attempt model: Decision Tree
     7. Model using Decision Tree, Random forest compare accuracy
     
         
         

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import datetime

import re
import nltk
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn import metrics

from sklearn.metrics import accuracy_score, classification_report

from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression


import warnings
warnings.filterwarnings("ignore")

In [None]:
tweets_df = pd.read_csv("../input/twitter-airline-sentiment/Tweets.csv")

In [None]:
tweets_df.head()

In [None]:
tweets_df.shape

In [None]:
tweets_df.columns

In [None]:
# Find constant columns, i.e. columns that hold fewer than two distinct values
# (NaN is counted as a value, exactly as Series.unique() would).
nonunique_cols = [
    col for col in tweets_df.columns
    if tweets_df[col].nunique(dropna=False) < 2
]
nonunique_cols

#### missing value analysis:

In [None]:
#Check for missing values
100*tweets_df.isna().sum()/len(tweets_df)

We observe that airline_sentiment_gold, negativereason_gold and tweet_coord have more than 90% missing values; let us drop them, as they carry too little information to be useful.

In [None]:
# Remove the three columns that the missing-value check flagged.
sparse_cols = ['airline_sentiment_gold', 'negativereason_gold', 'tweet_coord']
tweets_df.drop(columns=sparse_cols, inplace=True)

In [None]:
100*tweets_df.isna().sum()/len(tweets_df)

In [None]:
tweets_df[['negativereason', 'negativereason_confidence', 'tweet_location', 'user_timezone']].head()

## EDA

In [None]:
# Data balance
def createPieChartFor(t_df):
    """Draw a pie chart showing the percentage share of each distinct value.

    t_df: a pandas object that supports ``value_counts()`` and ``len()``,
          e.g. a single DataFrame column.
    """
    counts = t_df.value_counts()
    # Express every count as a percentage of the total number of entries.
    shares = 100 * counts / len(t_df)

    figure, axis = plt.subplots()
    axis.pie(
        shares,
        labels=counts.index.values,
        autopct='%1.2f%%',
        shadow=True,
        startangle=90,
    )
    # Equal aspect ratio ensures that the pie is drawn as a circle.
    axis.axis('equal')
    plt.show()

In [None]:
createPieChartFor(tweets_df.airline_sentiment)

From the chart above we can see that the majority of tweets are negative (63%), followed by neutral (21%) and positive (16%).

In [None]:
createPieChartFor(tweets_df.airline)

Let us :
1. now check total tweets for each of the airlines and
2. how many of these tweets per airline are negative, positive and neutral

In [None]:
# Count tweets for every (airline, sentiment) pair, then pivot the
# sentiments into columns so each airline becomes one group of bars.
pair_counts = tweets_df.groupby(['airline', 'airline_sentiment']).airline_sentiment.count()
airline_sentiment_df = pair_counts.unstack()
airline_sentiment_df.plot(kind='bar')
# Anchor the legend just outside the right edge of the axes.
plt.legend(loc="upper left", bbox_to_anchor=(1.04, 1))

In [None]:
airline_sentiment_df.plot(kind='bar', stacked=True)
plt.legend(bbox_to_anchor=(1.04,1), loc="upper left")

From above graph we can see that
1. United, US Airways and American have substantially more negative tweets; these airlines also received more tweets overall
2. Virgin America, Delta and Southwest have fairly balanced tweets

Let's convert tweet_created to a date and check whether we can get any insights

In [None]:
# Parse all timestamps in a single vectorised call (rather than once per row
# through .apply) and keep only the calendar date for per-day grouping.
tweets_df['tweet_created'] = pd.to_datetime(tweets_df['tweet_created']).dt.date

In [None]:
# Number of tweets per day for each airline, with airlines pivoted into columns.
per_day_counts = tweets_df.groupby(['tweet_created', 'airline']).airline_sentiment.count()
temp_df = per_day_counts.unstack()
ax1 = temp_df.plot(kind='bar', figsize=(15, 5))
ax1.set_ylabel('Tweets')
# Anchor the legend just outside the right edge of the axes.
plt.legend(loc="upper left", bbox_to_anchor=(1.04, 1))

For American we have the tweets coming in from 22-02-2015 onwards

In [None]:
# Count tweets for every (date, airline, sentiment) combination.
neg_tweet_df = tweets_df.groupby(['tweet_created','airline','airline_sentiment']).size()
# Keep only the entries whose third index level (the sentiment) is 'negative'.
neg_tweet_df = neg_tweet_df.loc(axis=0)[:,:,'negative']
# Re-aggregate per (date, airline), pivot airlines into columns and plot as bars.
ax2 = neg_tweet_df.groupby(['tweet_created','airline']).sum().unstack().plot(kind='bar', figsize = (15,5), rot=70)
ax2.set_ylabel('Negative Tweets')
# Anchor the legend just outside the right edge of the axes.
plt.legend(bbox_to_anchor=(1.04,1), loc="upper left")

In [None]:
sns.set_style("whitegrid")
ax = sns.barplot(x="airline_sentiment", y="airline_sentiment_confidence", data=tweets_df)

In [None]:
tweets_df.negativereason.value_counts()

In [None]:
tweets_df.negativereason.value_counts().plot(kind='bar', figsize=(15,5))

As we can see, the most frequently cited reasons for negative tweets are:
1. Customer service issue
2. Late flight

In [None]:
# Bar plot of negativereason_confidence for each negative reason.
plt.figure(figsize=(15, 4))
sns.set(font_scale=1.1)
sns.set_style("whitegrid")
ax = sns.barplot(
    data=tweets_df,
    x="negativereason",
    y="negativereason_confidence",
)
# Tilt the category labels so they do not overlap.
plt.xticks(rotation=70)

In [None]:
from wordcloud import WordCloud,STOPWORDS
def createWrdCloudForSentiment(sentiment):
    """Render a word cloud built from all tweets with the given sentiment.

    sentiment: value of the ``airline_sentiment`` column to select,
               e.g. 'negative' or 'positive'.

    Before the cloud is generated, tokens containing 'http', tokens that
    start with '@' and the bare token 'RT' are removed from the text.
    """
    temp_df = tweets_df[tweets_df.airline_sentiment == sentiment]
    words = " ".join(temp_df.text)
    cleaned_words = " ".join(
        w for w in words.split()
        if 'http' not in w
        and not w.startswith('@')
        and w != 'RT'
    )

    wrdcld = WordCloud(stopwords=STOPWORDS,
                       background_color='black',
                       width=1500,
                       height=1000).generate(cleaned_words)
    plt.figure(figsize=(10, 10))
    plt.imshow(wrdcld)
    plt.axis('off')
    # show() has to be called; a bare `plt.show` only names the function.
    plt.show()

In [None]:
createWrdCloudForSentiment('negative')

We observe that 'flight', 'hour', 'help', 'time', 'hold', 'bag', 'plane' are present more frequently in negative statements.

In [None]:
createWrdCloudForSentiment('positive')

We observe that 'thank', 'flight', 'great', 'will', 'awesome', 'love' are present more frequently in positive statements.

we'll do more detailed analysis below

In [None]:
tweets_df.columns

# Let us start analysing the tweets, 
## We'll now clean up the text data

for this we'll follow the below steps:
1. Remove all the special characters
2. convert all letters to lower case
3. filter out english stop words
4. stemmer (optional)

In [None]:
tweets_df.text

In [None]:
nltk.download('stopwords')
eng_stops = set(stopwords.words("english"))

In [None]:
#nltk.download('wordnet')

In [None]:
## We'll check later whether stemming or lemmatization makes any difference
#from nltk.stem.porter import PorterStemmer
#stemmer = PorterStemmer()
#
#from nltk.stem import WordNetLemmatizer 
#lemmatizer = WordNetLemmatizer() 

In [None]:
def process_message(tweet):
    """Normalise a raw tweet into a lower-case string without stop words.

    tweet: the raw tweet text.

    Every character that is not an ASCII letter is replaced by a space, the
    text is lower-cased and split on whitespace, and words contained in
    ``eng_stops`` are dropped. No stemming or lemmatization is applied.
    Returns the remaining words joined by single spaces.
    """
    letters_only = re.sub("[^a-zA-Z]", " ", tweet)
    kept = []
    for token in letters_only.lower().split():
        if token in eng_stops:
            continue
        kept.append(token)
    return " ".join(kept)

In [None]:
# Store the cleaned version of every tweet in a new column.
tweets_df['clean_tweet'] = tweets_df['text'].apply(process_message)

In [None]:
tweets_df['clean_tweet'].to_list()

## Make test-train split

In [None]:
train_df, test_df = train_test_split(tweets_df, test_size=0.3, random_state=42)

In [None]:
# Materialise the cleaned tweets of each split as plain Python lists.
train_tweets = list(train_df.clean_tweet)
test_tweets = list(test_df.clean_tweet)

## TF-IDF

In [None]:
# TF-IDF features: fit the vectorizer on the training tweets only, then reuse
# the fitted vectorizer to transform the test tweets.
vectorizer = TfidfVectorizer()
train_tfidf_model = vectorizer.fit_transform(train_tweets)
test_tfidf_model = vectorizer.transform(test_tweets)

In [None]:
# Let's look at the TF-IDF matrix as a dataframe (one column per vocabulary term).
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement (available since scikit-learn 1.0).
train_tfidf = pd.DataFrame(train_tfidf_model.toarray(), columns=vectorizer.get_feature_names_out())
train_tfidf

In [None]:
# get_feature_names() was removed in scikit-learn 1.2; use its replacement and
# convert the returned array to a list so the whole vocabulary is printed.
print(vectorizer.get_feature_names_out().tolist())

# Now we'll apply models to predict sentiments from the tweet text data

In [None]:
# Candidate classifiers to compare.
cls = [
    LogisticRegression(),
    MultinomialNB(),
    DecisionTreeClassifier(),
    RandomForestClassifier(n_estimators=200),
    KNeighborsClassifier(n_neighbors=5),
]

# Class names of the evaluated classifiers are collected here.
cls_name = list()

In [None]:
# Train every candidate classifier on the TF-IDF training features and
# score it on the held-out test tweets.
lbl_actual = test_df.airline_sentiment
# Reset both result lists in this cell so that re-running it keeps
# cls_name and accuracy the same length for the bar plot.
cls_name = []
accuracy = []
for cl in cls:
    model = cl.fit(train_tfidf_model, train_df.airline_sentiment)
    lbl_pred = model.predict(test_tfidf_model)
    # scikit-learn metrics expect (y_true, y_pred); passing them the other
    # way round swaps precision and recall in the classification report.
    a = round(100 * accuracy_score(lbl_actual, lbl_pred), 2)
    name = cl.__class__.__name__
    accuracy.append(a)
    cls_name.append(name)
    print("{}  Accuracy Score : {}%".format(name, a))
    print(classification_report(lbl_actual, lbl_pred))

In [None]:
plt.bar(cls_name, accuracy)
plt.xticks(rotation=70)

## Output

In [None]:
# Refit logistic regression and predict with that model. Using `model` here
# would pick up whichever classifier was fitted last in the comparison loop.
lg_model = LogisticRegression().fit(train_tfidf_model, train_df.airline_sentiment)
lg_lbl_pred = lg_model.predict(test_tfidf_model)

In [None]:
# Pair the id and original text of every test tweet with its predicted label.
lg_lbl_pred_df = test_df[['tweet_id', 'text']].assign(lg_reg=lg_lbl_pred)
lg_lbl_pred_df.head()

In [None]:
lg_lbl_pred_df.to_csv('sentiments.csv', index=False)

Conclusion

Logistic regression is the better model for predicting the results

Reference: 
https://www.kaggle.com/jiashenliu/how-can-we-predict-the-sentiment-by-tweets