# Natural Language Processing Project

Objective is to classify the sentiment of tweets from customers about a US Airline.

#### Importing libraries:

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import string 
import unicodedata                                    
from bs4 import BeautifulSoup 
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')                                           
from nltk.corpus import stopwords                       
from nltk.tokenize import word_tokenize, sent_tokenize  
from nltk.stem.wordnet import WordNetLemmatizer 

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/rohanenagala/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/rohanenagala/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/rohanenagala/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
tweets_data = pd.read_csv('Tweets.csv')

#### Data summary:

In [3]:
tweets_data.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


In [4]:
tweets_data.shape

(14640, 15)

In [5]:
tweets_data.info

<bound method DataFrame.info of                  tweet_id airline_sentiment  airline_sentiment_confidence  \
0      570306133677760513           neutral                        1.0000   
1      570301130888122368          positive                        0.3486   
2      570301083672813571           neutral                        0.6837   
3      570301031407624196          negative                        1.0000   
4      570300817074462722          negative                        1.0000   
...                   ...               ...                           ...   
14635  569587686496825344          positive                        0.3487   
14636  569587371693355008          negative                        1.0000   
14637  569587242672398336           neutral                        1.0000   
14638  569587188687634433          negative                        1.0000   
14639  569587140490866689           neutral                        0.6771   

               negativereason  negativereas

#### Dropping unnecessary columns:

In [6]:
tweets = tweets_data.drop(columns=['tweet_id', 'airline_sentiment_confidence', 'negativereason', 'negativereason_confidence', 'airline', 'airline_sentiment_gold', 'name', 'negativereason_gold', 'retweet_count', 'tweet_coord', 'tweet_created', 'tweet_location', 'user_timezone'])

In [7]:
tweets.shape

(14640, 2)

In [8]:
tweets.head()

Unnamed: 0,airline_sentiment,text
0,neutral,@VirginAmerica What @dhepburn said.
1,positive,@VirginAmerica plus you've added commercials t...
2,neutral,@VirginAmerica I didn't today... Must mean I n...
3,negative,@VirginAmerica it's really aggressive to blast...
4,negative,@VirginAmerica and it's a really big bad thing...


#### Data pre-processing:

In [9]:
#html tag removal
def html_tag_removal(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

In [10]:
#tokenization
def tokenization(text):
    return nltk.word_tokenize(text)

In [11]:
#remove the numbers
def remove_numbers(text):
  text = re.sub(r'\d+', '', str(text))
  return text

In [12]:
#removal of special characters and punctuations
def special_character_remover(words):
    new_words = []
    for word in words:
        special_character = unicodedata.normalize('NFKD', word).encode('ascii', 'ignore').decode('utf-8', 'ignore')
        new_words.append(special_character)
    return new_words

def remove_punctuation(words):
    new_words = []
    for word in words:
        rempunc_word = re.sub(r'[^\w\s]', '', word)
        if rempunc_word != '':
            new_words.append(rempunc_word)
    return new_words

In [13]:
#conversion to lowercase
def lowercase(words):
    new_words = []
    for word in words:
        lower_word = word.lower()
        new_words.append(lower_word)
    return new_words

In [14]:
#remove stopwords
stopwords = stopwords.words('english')

customlist = ['not', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn',
        "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn',
        "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn',
        "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]

stopwords = list(set(stopwords) - set(customlist))                              

def stopwords_removal(words):
    new_words = []
    for word in words:
        if word not in stopwords:
            new_words.append(word)
    return new_words

In [15]:
#lemmatize or stemming
lemmatizer = WordNetLemmatizer()

def lemmatize(words):
    new_words = []
    for word in words:
      new_words.append(lemmatizer.lemmatize(word, pos='v'))
    return new_words

In [16]:
def change_to_words(text):
  return tokenization(remove_numbers(html_tag_removal(text)))

In [17]:
def normalize(words):
    words = lemmatize(stopwords_removal(remove_punctuation(lowercase(special_character_remover(words)))))
    return ' '.join(words)

In [18]:
tweets['text'] = tweets.apply(lambda row: normalize(change_to_words(row['text'])), axis=1)

In [19]:
tweets.head()

Unnamed: 0,airline_sentiment,text
0,neutral,virginamerica dhepburn say
1,positive,virginamerica plus add commercials experience ...
2,neutral,virginamerica nt today must mean need take ano...
3,negative,virginamerica really aggressive blast obnoxiou...
4,negative,virginamerica really big bad thing


#### Vectorization:

In [20]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(max_features=1000)
count_data_features = vectorizer.fit_transform(tweets['text'])

count_data_features = count_data_features.toarray()

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=1000)
tfidf_data_features = vectorizer.fit_transform(tweets['text'])

tfidf_data_features

<14640x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 117434 stored elements in Compressed Sparse Row format>

In [22]:
labels = tweets['airline_sentiment']

#### Modelling:

In [23]:
#count vectorization
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(count_data_features, labels, test_size=0.3, random_state=50)

In [24]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

forest = RandomForestClassifier(n_estimators=10, n_jobs=1)

forest = forest.fit(X_train, y_train)
forest.score(X_test, y_test)

0.7352003642987249

In [25]:
#tf-idf vectorization
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(tfidf_data_features, labels, test_size=0.3, random_state=50)

In [26]:
forest = RandomForestClassifier(n_estimators=10, n_jobs=1)

forest = forest.fit(X_train, y_train)
forest.score(X_test, y_test)

0.7420309653916212