## Airline Tweets pre-processing

In [1]:
# Importing libraries
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import nltk
from bs4 import BeautifulSoup
from nltk.stem import WordNetLemmatizer

import re
import glob
import os
import pandas as pd
from nltk.corpus import stopwords
from wordcloud import WordCloud, STOPWORDS
import spacy
from spacy.lang.en import English
from nltk import word_tokenize, pos_tag, ne_chunk
from autocorrect import Speller
from nltk.stem import PorterStemmer
from textblob import TextBlob

In [2]:
# Loading all the fetched data to pandas dataframe
df = pd.read_csv('final_train_df.csv', encoding="ISO-8859-1")

# Shuffling with a fixed seed so that re-running the notebook on a fresh
# kernel produces the same row order.
# NOTE(review): df_shuffled is not referenced by the later cells of this
# notebook; they all operate on the unshuffled `df`. Confirm whether the
# shuffle was meant to replace `df`.
df_shuffled = df.sample(frac=1, random_state=42).reset_index(drop=True)

df.head()

Unnamed: 0.1,Unnamed: 0,tweet_id,username,text,airline_name,airline_sentiment,tweet_location
0,0,5.70306e+17,cairdin,@VirginAmerica What @dhepburn said.,Virgin America,neutral,
1,1,5.70301e+17,jnardino,@VirginAmerica plus you've added commercials t...,Virgin America,positive,
2,2,5.70301e+17,yvonnalynn,@VirginAmerica I didn't today... Must mean I n...,Virgin America,neutral,Lets Play
3,3,5.70301e+17,jnardino,@VirginAmerica it's really aggressive to blast...,Virgin America,negative,
4,4,5.70301e+17,jnardino,@VirginAmerica and it's a really big bad thing...,Virgin America,negative,


In [3]:
# One sub-frame per sentiment label, selected with boolean masks on the
# 'airline_sentiment' column.
sentiment = df['airline_sentiment']
positive_df = df.loc[sentiment.eq('positive')]
neutral_df = df.loc[sentiment.eq('neutral')]
negative_df = df.loc[sentiment.eq('negative')]

### Initial pre-processing

In [4]:
# Dropping duplicate tweets (rows sharing the same 'text' value)
df = df.drop_duplicates(['text'])

# Drop the bookkeeping columns if they exist. errors='ignore' skips any
# column that is absent, so no membership test is needed. (A chained test
# such as `'a' and 'b' in df.columns` only checks 'b', because the non-empty
# string 'a' is always truthy.)
df = df.drop(columns=['Unnamed: 0', 'tweet_id'], errors='ignore')

df.head()

Unnamed: 0,username,text,airline_name,airline_sentiment,tweet_location
0,cairdin,@VirginAmerica What @dhepburn said.,Virgin America,neutral,
1,jnardino,@VirginAmerica plus you've added commercials t...,Virgin America,positive,
2,yvonnalynn,@VirginAmerica I didn't today... Must mean I n...,Virgin America,neutral,Lets Play
3,jnardino,@VirginAmerica it's really aggressive to blast...,Virgin America,negative,
4,jnardino,@VirginAmerica and it's a really big bad thing...,Virgin America,negative,


### Removing special characters

In [5]:
def clean(txt):
    """Strip HTML residue from a tweet.

    Removes, in order:
      * the literal two-character sequence ``()``,
      * HTML anchor tags (``<a ...>`` up to the matching ``</a>`` when one
        is present, otherwise just the opening tag),
      * the entities ``&amp``, ``&gt`` and ``&lt`` (with or without the
        trailing ``;``),
    and replaces non-breaking spaces (``\\xa0``) with a regular space.

    ``str.replace`` matches its argument literally, so regex-style patterns
    such as ``'(&amp)'`` must go through ``re.sub`` to have any effect.

    Parameters
    ----------
    txt : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned text.
    """
    txt = txt.replace("()", "")
    # Anchor tags: opening tag plus, when present, everything up to </a>.
    txt = re.sub(r'<a\b[^>]*>(?:.*?</a>)?', '', txt,
                 flags=re.DOTALL | re.IGNORECASE)
    # Escaped ampersand / greater-than / less-than entities.
    txt = re.sub(r'&(?:amp|gt|lt);?', '', txt)
    txt = txt.replace('\xa0', ' ')
    return txt

# Run clean() over every tweet and preview the first rows.
df['text'] = df['text'].apply(clean)
df['text'].head()

0                  @VirginAmerica What @dhepburn said.
1    @VirginAmerica plus you've added commercials t...
2    @VirginAmerica I didn't today... Must mean I n...
3    @VirginAmerica it's really aggressive to blast...
4    @VirginAmerica and it's a really big bad thing...
Name: text, dtype: object

### Extracting all the hashtags

In [6]:
# Collect the hashtag names (text after '#', 1-50 word characters) of each tweet.
hashtag_pattern = re.compile("#([a-zA-Z0-9_]{1,50})")
df['tweet_hastags'] = df['text'].apply(lambda tweet: hashtag_pattern.findall(tweet))
df['tweet_hastags'].value_counts()

[]                      12909
[DestinationDragons]       70
[fail]                     36
[usairwaysfail]            21
[customerservice]          21
                        ...  
[peanutsonaplatter]         1
[letitgo]                   1
[hotlanta]                  1
[notmadeofmoney]            1
[BlackBerry10]              1
Name: tweet_hastags, Length: 1785, dtype: int64

### Extracting all the mentions

In [7]:
# Collect the mentioned user names (text after '@', 1-50 word characters) of each tweet.
mention_pattern = re.compile("@([a-zA-Z0-9_]{1,50})")
df['mentions'] = df['text'].apply(lambda tweet: mention_pattern.findall(tweet))
df['mentions'].value_counts()

[united]                            3370
[USAirways]                         2470
[AmericanAir]                       2281
[SouthwestAir]                      2090
[JetBlue]                           1932
                                    ... 
[SouthwestAir, JasonWhitely]           1
[SouthwestAir, SwagglikeBean]          1
[SouthwestAir, SacIntlAirport]         1
[SouthwestAir, DJQ_KC, djimpact]       1
[JetBlue, hellobrittNEY_]              1
Name: mentions, Length: 1021, dtype: int64

### Removing hashtags and mentions from tweets

In [8]:
def clean_annotation(tweet):
    """Return ``tweet`` with every @mention and #hashtag removed.

    Mentions are stripped first, then hashtags; surrounding whitespace is
    left untouched.
    """
    for marker in ("@", "#"):
        tweet = re.sub(marker + "[A-Za-z0-9_]+", "", tweet)
    return tweet

# Build the working 'tweets' column from 'text' with mentions/hashtags removed.
df['tweets'] = df['text'].apply(clean_annotation)
df['tweets'].head()

0                                          What  said.
1     plus you've added commercials to the experien...
2     I didn't today... Must mean I need to take an...
3     it's really aggressive to blast obnoxious "en...
4             and it's a really big bad thing about it
Name: tweets, dtype: object

### Removing HTTP Links 

In [9]:
# Delete every http:// or https:// URL (up to the next whitespace) from each tweet.
url_pattern = re.compile(r'https?:\/\/\S*', flags=re.MULTILINE)
df['tweets'] = df['tweets'].apply(lambda tweet: url_pattern.sub('', tweet))
df['tweets'].head()

0                                          What  said.
1     plus you've added commercials to the experien...
2     I didn't today... Must mean I need to take an...
3     it's really aggressive to blast obnoxious "en...
4             and it's a really big bad thing about it
Name: tweets, dtype: object

### Converting to Lowercase

In [10]:
# Lowercase every word; splitting and re-joining also collapses runs of
# whitespace into single spaces and trims the ends.
df['tweets'] = df['tweets'].apply(
    lambda tweet: " ".join(word.lower() for word in tweet.split())
)
df['tweets'].head()

0                                           what said.
1    plus you've added commercials to the experienc...
2    i didn't today... must mean i need to take ano...
3    it's really aggressive to blast obnoxious "ent...
4             and it's a really big bad thing about it
Name: tweets, dtype: object

### Removing punctuation

In [11]:
# Remove every character that is neither a word character nor whitespace.
# regex=True is passed explicitly: without it, recent pandas versions treat
# the pattern as a literal string and nothing would be removed. The raw
# string avoids invalid-escape warnings for \w and \s.
df['tweets'] = df['tweets'].str.replace(r'[^\w\s]', '', regex=True)
df['tweets'].head()

0                                            what said
1    plus youve added commercials to the experience...
2    i didnt today must mean i need to take another...
3    its really aggressive to blast obnoxious enter...
4              and its a really big bad thing about it
Name: tweets, dtype: object

### Removing emojis from tweets

In [5]:
# Read and compile the emoji pattern once. Reading the file handle inside
# the function would exhaust it on the first call, so every later call would
# compile an empty pattern and leave emojis in place. The `with` block also
# closes the file.
# NOTE(review): the file is opened with the platform default encoding —
# confirm whether an explicit encoding is needed for this pattern file.
with open("../LDA/emoji_regex.txt", "r") as f:
    EMOJI_PATTERN = re.compile(f.read(), flags=re.UNICODE)


def remove_emoji(text):
    """Return ``text`` with every match of the emoji pattern removed.

    Parameters
    ----------
    text : str
        Tweet text to clean.

    Returns
    -------
    str
        ``text`` with all matches of the pattern loaded from
        ``../LDA/emoji_regex.txt`` replaced by the empty string.
    """
    return EMOJI_PATTERN.sub(r'', text)

str

In [13]:
# Strip emojis from every tweet and preview the result.
df['tweets'] = df['tweets'].apply(remove_emoji)
df['tweets'].head()

0                                            what said
1    plus youve added commercials to the experience...
2    i didnt today must mean i need to take another...
3    its really aggressive to blast obnoxious enter...
4              and its a really big bad thing about it
Name: tweets, dtype: object

### Lemmatization

In [14]:
# Load spaCy's small English pipeline with the parser and NER components
# switched off.
import spacy

load_model = spacy.load("en_core_web_sm", disable=["parser", "ner"])

In [15]:
def lemmatize(x):
    """Return ``x`` with each spaCy token replaced by its lemma.

    The text is run through the ``load_model`` pipeline and the lemmas are
    joined back together with single spaces.
    """
    lemmas = (token.lemma_ for token in load_model(x))
    return " ".join(lemmas)

In [16]:
# Lemmatize every tweet and preview the result.
df['tweets'] = df['tweets'].apply(lemmatize)
df['tweets'].head()

0                                             what say
1    plus you ve add commercial to the experience t...
2    I do not today must mean I need to take anothe...
3    its really aggressive to blast obnoxious enter...
4              and its a really big bad thing about it
Name: tweets, dtype: object

### Part of speech tagging (POS)

In [17]:
from textblob import TextBlob


def pos_tag(x):
    """Return TextBlob's ``tags`` for the text ``x``.

    NOTE(review): this definition rebinds the name ``pos_tag`` that the
    import cell also brings in from ``nltk``. After this cell runs, a bare
    ``pos_tag`` refers to this TextBlob-based function, which takes a string
    rather than a token list.
    """
    return TextBlob(x).tags

In [18]:
# !python -m textblob.download_corpora

In [19]:
# Store the (word, tag) pairs of every tweet in a separate column.
df['tweets_tags'] = df['tweets'].apply(pos_tag)
df['tweets_tags'].head()

0                             [(what, WP), (say, VBP)]
1    [(plus, CC), (you, PRP), (ve, VBP), (add, VB),...
2    [(I, PRP), (do, VBP), (not, RB), (today, NN), ...
3    [(its, PRP$), (really, RB), (aggressive, JJ), ...
4    [(and, CC), (its, PRP$), (a, DT), (really, RB)...
Name: tweets_tags, dtype: object

### Named Entity recognition

In [20]:
def ner(x):
    """Return the NLTK named-entity chunk tree for the text ``x``.

    The text is tokenized, POS-tagged and then chunked with ``ne_chunk``.

    Parameters
    ----------
    x : str
        Text to analyse.

    Returns
    -------
    The chunk tree produced by ``nltk.ne_chunk`` for the tagged tokens.
    """
    tokens = word_tokenize(x)
    # Call nltk.pos_tag explicitly: the bare name `pos_tag` is rebound in
    # this notebook to a TextBlob helper that expects a string, not tokens.
    tagged = nltk.pos_tag(tokens)
    # ne_chunk takes the list of (token, tag) pairs directly; joining them
    # into a string would raise a TypeError.
    return ne_chunk(tagged)

In [22]:
# df['ner_tweets'] = df['tweets'].apply(lambda x: ner(x))
# df['ner_tweets'].head()

### Stemming

In [23]:
# Replace every whitespace-separated word with its Porter stem.
# NOTE(review): the spell-correction step further down runs on these stems,
# and the sample outputs show stems being "corrected" into different words
# (e.g. 'anoth' -> 'not'). Confirm the intended order of the two steps.
stemming = PorterStemmer()
df['tweets'] = df['tweets'].apply(
    lambda tweet: " ".join(map(stemming.stem, tweet.split()))
)
df['tweets'].head()

0                                             what say
1          plu you ve add commerci to the experi tacki
2    i do not today must mean i need to take anoth ...
3    it realli aggress to blast obnoxi entertain in...
4               and it a realli big bad thing about it
Name: tweets, dtype: object

### Spell Correction

In [24]:
# Build the English speller once instead of on every call; spell_check is
# applied to each row of the frame, so per-call construction is repeated,
# loop-invariant work.
_speller = Speller(lang='en')


def spell_check(x):
    """Return ``x`` after running it through the English ``Speller``.

    Parameters
    ----------
    x : str
        Text to correct.

    Returns
    -------
    The value returned by calling the speller on ``x``.
    """
    return _speller(x)

In [25]:
# Spell-correct every tweet and preview the result.
df['tweets'] = df['tweets'].apply(spell_check)
df['tweets'].head()

0                                             what say
1            pl you ve add commerce to the expert tack
2     i do not today must mean i need to take not trip
3    it really address to blast obnoxi entertain in...
4               and it a really big bad thing about it
Name: tweets, dtype: object

In [26]:
# Write the processed frame to disk. No `index` argument is given, so the
# row index is written with pandas' default setting.
df.to_csv(path_or_buf='final_df.csv')