In [1]:
import warnings
warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import re
import string
from sklearn.cluster import KMeans

In [2]:
df = pd.read_csv('Tweets.csv')
df.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


In [3]:
for i in df['airline'].unique():
    print(i)
    print(df[df["airline"] == i].airline_sentiment.value_counts())
    print('\n')

Virgin America
negative    181
neutral     171
positive    152
Name: airline_sentiment, dtype: int64


United
negative    2633
neutral      697
positive     492
Name: airline_sentiment, dtype: int64


Southwest
negative    1186
neutral      664
positive     570
Name: airline_sentiment, dtype: int64


Delta
negative    955
neutral     723
positive    544
Name: airline_sentiment, dtype: int64


US Airways
negative    2263
neutral      381
positive     269
Name: airline_sentiment, dtype: int64


American
negative    1960
neutral      463
positive     336
Name: airline_sentiment, dtype: int64




Based on the output above i create a new dataframe for Southwest only. The data seems to have an issue because tweets classified as Delta are addressing JetBlue. For that reason i chose not to use the Delta subset. Virgin America has a good distribution, but also has a very limited number of tweets. For these reasons i chose Southwest as this subset has the least skewed distribution of positive/negative/neutral tweets (after Virgin America and Delta/JetBlue) while still having a resonable number of tweets. Tweets were also re-indexed starting at 0.

In [4]:
single_airline = pd.DataFrame(df[df['airline'] == 'Southwest'])
single_airline.reset_index(drop=True, inplace=True)
single_airline.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570309156290367488,negative,1.0,longlines,0.6624,Southwest,,thisradlove,,0,@SouthwestAir still waiting. Just hit one hour.,,2015-02-24 11:47:53 -0800,Today I'm in: Maryland,Atlantic Time (Canada)
1,570309145276125185,negative,0.6361,Cancelled Flight,0.6361,Southwest,,tomcblock,,0,@SouthwestAir although I'm not happy you Cance...,,2015-02-24 11:47:50 -0800,"ÜT: 38.965477,-77.428287",Eastern Time (US & Canada)
2,570307615189835777,negative,1.0,Customer Service Issue,1.0,Southwest,,cindyjwhitaker,,0,@SouthwestAir Hello - been on hold for extreme...,,2015-02-24 11:41:45 -0800,,Central Time (US & Canada)
3,570306086475075585,neutral,0.6443,,,Southwest,,liveseasoned,,0,@SouthwestAir I'm teaching new #travelers how ...,,2015-02-24 11:35:41 -0800,,
4,570305647759265793,negative,1.0,Customer Service Issue,1.0,Southwest,,cindyjwhitaker,,0,@SouthwestAir Very frustrated for the loooooon...,,2015-02-24 11:33:56 -0800,,Central Time (US & Canada)


Here i check for missing data and i can see there are no null values.

In [5]:
single_airline.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2420 entries, 0 to 2419
Data columns (total 15 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   tweet_id                      2420 non-null   int64  
 1   airline_sentiment             2420 non-null   object 
 2   airline_sentiment_confidence  2420 non-null   float64
 3   negativereason                1186 non-null   object 
 4   negativereason_confidence     1445 non-null   float64
 5   airline                       2420 non-null   object 
 6   airline_sentiment_gold        8 non-null      object 
 7   name                          2420 non-null   object 
 8   negativereason_gold           5 non-null      object 
 9   retweet_count                 2420 non-null   int64  
 10  text                          2420 non-null   object 
 11  tweet_coord                   160 non-null    object 
 12  tweet_created                 2420 non-null   object 
 13  twe

Here i am just doing a check to see that i have the correct subset before proceeding.

In [6]:
single_airline['text']

0         @SouthwestAir still waiting. Just hit one hour.
1       @SouthwestAir although I'm not happy you Cance...
2       @SouthwestAir Hello - been on hold for extreme...
3       @SouthwestAir I'm teaching new #travelers how ...
4       @SouthwestAir Very frustrated for the loooooon...
                              ...                        
2415    @SouthwestAir won't answer their phones #Horri...
2416    @SouthwestAir We have been stuck in SJU for se...
2417               @SouthwestAir nice work on the update!
2418    @SouthwestAir you guys there? Are we on hour 2...
2419    @SouthwestAir its cool that my bags take a bit...
Name: text, Length: 2420, dtype: object

It is clear the data still needs to be cleaned. i opted to do this manually as it was still fairly simple and it can be hard to speak to exactly what is happening when utilizing the tools available for data cleaning. The code below iterates through each tweet in the data set. First i use regex to remove retweets, http links, bitly links, and emojis, digits, stop words and such words has length less than 3. These items need to be removed as some are completely unique (links), and retweets could generate duplicate data when stripped down to text. Emojis were removed as it can be hard to understand intention.

This resource was used to aid in removing emojis: https://stackoverflow.com/questions/33404752/removing-emojis-from-a-string-in-python

Next, the string is split into individual words, then all punctuation and special characters are removed. Following that, all characters are converted to lowercase. Finally, the words are joined back into a string.

The following resource was used to help figure out how: https://machinelearningmastery.com/clean-text-machine-learning-python/

In [7]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words=stopwords.words('english')

table = str.maketrans('', '', string.punctuation)

#remove emojis
emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags=re.UNICODE)

for i, value in single_airline['text'].items():
    value = re.sub('(RT\s@[A-Za-z]+[A-Za-z0-9-_]+)', '', value)  # remove re-tweet
    value = re.sub("@[A-Za-z0-9_]+","", value)
    value = re.sub(r'http\S+', '', value)   # remove http links
    value = re.sub(r'bit.ly/\S+', '', value)  # remove bitly links
    value=re.sub('[^a-zA-Z]',' ',value) #remove any character that IS NOT a-z OR A-Z
    value = emoji_pattern.sub(r'', value)
    value = value.split()
    value =  ["" if t.isdigit() else t for t in value] #remove digits
    value = [w for w in value if not w in stop_words]
    value = [w.translate(table) for w in value]
    value = [w.lower() for w in value]
    single_airline['text'][i] = ' '.join(word for word in value if len(word) > 2) #keep words of length greater than 2

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Asus\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
X = single_airline.text

Print the cleaned tweets to see if they have been cleaned properly. This isn't diffcult, but i leaned on the following resource for many of the following steps:

https://twitterdev.github.io/do_more_with_twitter_data/clustering-users.html

This is a developer resource directly from twitter.

In [9]:
for i, value in enumerate(X):
    print(i,': ', value.replace('\n',' '))

0 :  still waiting just hit one hour
1 :  although happy cancelled flighted flight home tomorrow phx atl dca happy easy rebook
2 :  hello hold extremely long time have confirmation amp get boarding pass have tried numerous times
3 :  teaching new travelers research budget amp save trip today
4 :  very frustrated loooooong wait time speak live person cannot get boarding pass flight tomorrow
5 :  still update text amp still response email feel like losing customer service care
6 :  agents ones rude unhelpful prompted initial tweet this easy fix
7 :  wife group prev flight got bumped reason alone two kids least put group
8 :  flight delayed hrs months pregnant amp supposed caring mom whose surgery very upset
9 :  thank much completely made things right
10 :  love always get best deals
11 :  follow send info
12 :  been hold min far trying book seat infant price increased meantime what
13 :  glassdoor best places work snagging overall notsurprising luvthem
14 :  poor performance around paid

In [10]:
print(single_airline.airline_sentiment.value_counts())

negative    1186
neutral      664
positive     570
Name: airline_sentiment, dtype: int64


## 5 most frequent words (not counting stopwords) in positive and Negative tweets

Here,i just created a new column with a list of tokenized words for each tweet text

In [11]:
single_airline['text_split'] = single_airline['text'].apply(lambda x: x.split(" "))

Now i have created a corpus for positive and negative tweet sentiment Now that i have tokenized the tweet text, i created lists containing words in all the positive and negative tweet text. So that positive tweet words and negative tweet words can be differentiated. For this, i am using use itertools to chain together all the positive and negative tweet text in single lists. The below link is followed

https://datascienceparichay.com/article/python-frequency-of-each-word-in-string/?fbclid=IwAR2zR3WYsLuFFlNLv7V6S9niWdoOnCXzeg3TxSp6FloqH97ufFV12pxYIx4


In [12]:
import itertools
# positive tweets
positive_reviews = single_airline[single_airline['airline_sentiment']=='positive']['text_split']
print("Total positive tweets: ", len(positive_reviews))
positive_reviews_words = list(itertools.chain(*positive_reviews))
print("Total words in positive tweets:", len(positive_reviews_words))
# negative negatives
negative_reviews = single_airline[single_airline['airline_sentiment']=='negative']['text_split']
print("Total negative tweets: ", len(negative_reviews))
negative_reviews_words = list(itertools.chain(*negative_reviews))
print("Total words in negative tweets:", len(negative_reviews_words))

Total positive tweets:  570
Total words in positive tweets: 4116
Total negative tweets:  1186
Total words in negative tweets: 11325


Now i have estimated the word frequency in the corpus Then i have figued out the frequency of each word in the positive and the negative corpus. For this, i am using collections. As collection has Count function which is helpful.Counter that returns an object which is essentially a dictionary with word to frequency mappings. It is useful tool for counting frequncy of word from corpus thats why i am using this. 

In [13]:
import collections
positive_words_frequency = collections.Counter(positive_reviews_words)

negative_words_frequency = collections.Counter(negative_reviews_words)


I have created a dataframe each for the top 5 most frequent words in positive and negative corpuses. It helps me to plot a horizontal bar chart of the 5 most frequent words in both positive and negative the corpuses. With this dataframe it is easier to visualize frequent word.

In [14]:
positive_freq_words_df = pd.DataFrame(positive_words_frequency.most_common(5),
                                     columns=["Word", "Frequency"])
print("5 most  frequent words in Positive tweet:\n")
print(positive_freq_words_df)

5 most  frequent words in Positive tweet:

     Word  Frequency
0  thanks        126
1   thank        117
2  flight         92
3   great         57
4    love         48


In [15]:
negative_freq_words_df = pd.DataFrame(negative_words_frequency.most_common(5),
                                     columns=["Word", "Frequency"])
print("5 most  frequent words in Negative tweet:\n")
print(negative_freq_words_df)

5 most  frequent words in Negative tweet:

        Word  Frequency
0     flight        408
1  cancelled        203
2       hold        151
3        get        136
4  flightled         98


## 5 most frequent words (not counting stopwords) in each identified cluster

I have created an instance of the CountVectorizer class and then Call the fit_transform() function in order to learn a vocabulary from one or more documents and then encode each as a vector. I set max features to 1000 so only the top 1000 most frequest words are used and then removed stop words. Here i define and fit clustering model with the ideal number of clusters based on the elbow graph (showed in clustering tool). Then a new column named cluster was created for the predicted value of cluster sothat i can get the cluster words easily from dataframe. Then i set a for loop according to the number of cluster. Rest of the process is similar for counting most frequent word in cluster as described above for counting positive and negative most frequent word. 

In [16]:
from sklearn.feature_extraction.text import CountVectorizer
num_clusters=6
bow_vectorizer = CountVectorizer(max_features=1000, stop_words = 'english')
tweet_matrix = bow_vectorizer.fit_transform(X)
kmeans = KMeans(n_clusters=num_clusters, max_iter=1000)
kmeans.fit_predict(tweet_matrix)

single_airline['cluster'] = kmeans.fit_predict(tweet_matrix) # creating a new column of cluster

In [17]:
for i in range(num_clusters):
    cluster_one= single_airline[single_airline['cluster']==i]['text_split']
   
    cluster_one_words = list(itertools.chain(*cluster_one))
     
    cluster_one_words_frequency = collections.Counter(cluster_one_words)
    
    
    cluster_one_freq_words_df = pd.DataFrame(cluster_one_words_frequency.most_common(5),
                                     columns=["Word", "Frequency"])
    print("Cluster", i ,":5 most frequent words\n")
    print(cluster_one_freq_words_df)
    print("\n")

Cluster 0 :5 most frequent words

      Word  Frequency
0      get        144
1   thanks        139
2    thank        119
3  flights        101
4     help         86


Cluster 1 :5 most frequent words

     Word  Frequency
0  flight        395
1     get         42
2    late         36
3    time         35
4  thanks         33


Cluster 2 :5 most frequent words

        Word  Frequency
0     flight        203
1  cancelled        195
2  flightled        106
3       hold         41
4   flighted         37


Cluster 3 :5 most frequent words

        Word  Frequency
0    service        122
1   customer        112
2        amp         14
3       hold         14
4  relations         11


Cluster 4 :5 most frequent words

        Word  Frequency
0       date          2
1      first          1
2       time          1
3      flyer          1
4  scheduled          1


Cluster 5 :5 most frequent words

      Word  Frequency
0     hold        100
1     hour         34
2    hours         21
3  minut