# Problem : Sentiment Analysis of US Airline Tweets

**Import the libraries**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import re


In [None]:
# Location of the tweets CSV, given relative to the notebook's working directory.
data_file = '../input/twitter-airline-sentiment/Tweets.csv'

In [None]:
# Load the CSV at `data_file` into a DataFrame; every later cell reads from `data`.
data = pd.read_csv(data_file)

# **Exploratory Data Analysis**

In [None]:

# Preview the first rows of the loaded frame.
data.head()

In [None]:
# Preview the last rows of the loaded frame.
data.tail()

In [None]:
# (number of rows, number of columns) of the frame.
data.shape

In [None]:
# Per-column dtype and non-null count summary.
data.info()

In [None]:
# List the column labels of the frame.
data.columns

**Checking for missing values**

In [None]:
# Element-wise missing-value mask: True wherever a value is missing.
data.isnull()

In [None]:
# Number of missing values in each column.
data.isnull().sum()

**Visualizations**

In [None]:
# Heatmap of the missing-value mask, one heatmap column per DataFrame column.
# Row tick labels and the colour bar are switched off to keep the figure compact.
sns.heatmap(
    data.isnull(),
    yticklabels=False,
    cbar=False,
    cmap='viridis',
)

In [None]:
# Share of tweets per airline. The trailing '%%' in `autopct` renders a literal
# percent sign, so wedge labels read as percentages instead of bare numbers,
# and the title lets the figure stand on its own when the notebook is skimmed.
data.airline.value_counts().plot(
    kind='pie',
    autopct='%1.0f%%',
    title='Share of tweets per airline',
)

In [None]:
# Share of tweets per sentiment class. The trailing '%%' in `autopct` renders a
# literal percent sign, so wedge labels read as percentages instead of bare
# numbers, and the title lets the figure stand on its own.
data.airline_sentiment.value_counts().plot(
    kind='pie',
    autopct='%1.0f%%',
    title='Share of tweets per sentiment',
)

In [None]:
# Number of tweets in each sentiment class.
sns.countplot(
    x='airline_sentiment',
    data=data,
    palette='viridis',
)

In [None]:
# Tweet counts per airline, split by sentiment class; a wider canvas is used
# so the grouped bars do not crowd each other.
plt.figure(figsize=(12, 7))
sns.countplot(
    x='airline',
    hue='airline_sentiment',
    data=data,
    palette='rainbow',
)

In [None]:
# airline_sentiment_confidence grouped by sentiment class, aggregated with
# seaborn's default bar-plot estimator.
sns.barplot(
    x='airline_sentiment',
    y='airline_sentiment_confidence',
    data=data,
    palette='viridis',
)

In [None]:
# Distribution of airline_sentiment_confidence for each airline.
sns.boxplot(
    x='airline',
    y='airline_sentiment_confidence',
    data=data,
)

# **Data Cleaning**

**1. Cleaning the text data**

In [None]:
# Select the model input and target by column name rather than by position, so
# the selection does not silently pick a different column if the CSV's column
# order ever changes.
Features = data['text'].values               # raw tweet text
Labels = data['airline_sentiment'].values    # sentiment label for each tweet

In [None]:
def clean_tweet(raw_text):
    """Normalise one tweet into lowercase, whitespace-collapsed word text.

    Parameters
    ----------
    raw_text : object
        The tweet; it is passed through ``str()`` first, so non-string
        values are handled as their string form.

    Returns
    -------
    str
        The cleaned, lower-cased text.
    """
    # Replace every non-word character (punctuation, symbols, ...) with a space.
    text = re.sub(r'\W', ' ', str(raw_text))

    # Drop single letters that stand alone between whitespace.
    text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text)

    # Drop a single letter at the very start of the text. The caret must be
    # unescaped to act as the start-of-string anchor; an escaped '\^' looks for
    # a literal '^', which the first substitution has already removed.
    text = re.sub(r'^[a-zA-Z]\s+', ' ', text)

    # Collapse runs of whitespace into one space.
    text = re.sub(r'\s+', ' ', text)

    # Remove a leading 'b' token (left over when bytes are stringified).
    text = re.sub(r'^b\s+', ' ', text)

    # Convert to lowercase.
    return text.lower()


# Cleaned tweets, in the same order as `Features`.
processed_Features = [clean_tweet(tweet) for tweet in Features]

**TF-IDF**

In [None]:
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
!pip install --user -U nltk
import nltk
nltk.download('stopwords')

In [None]:
# TF-IDF settings:
#   max_features - cap on vocabulary size
#   min_df       - ignore terms that appear in fewer than 7 tweets
#   max_df       - ignore terms that appear in more than 80% of tweets
#   stop_words   - NLTK's English stop-word list
# NOTE(review): the vectorizer is fitted on all tweets before the train/test
# split, so test-set vocabulary statistics influence the features - confirm
# this is acceptable for the comparison.
vectorizer = TfidfVectorizer(
    max_features=2500,
    min_df=7,
    max_df=0.8,
    stop_words=stopwords.words('english'),
)
# Rebinds `processed_Features` from the list of cleaned strings to the dense
# TF-IDF matrix, so this cell is not meant to be re-run on its own output.
processed_Features = vectorizer.fit_transform(processed_Features).toarray()

**Train-Test Split**

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    processed_Features,
    Labels,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Record the interpreter version the notebook was run with.
import sys
print(sys.version)

# **Build Classification Models and Error Analysis Comparison**

**1. Random Forest Classifier**

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Forest of 200 trees with a fixed seed, fitted on the training split.
text_classifier = RandomForestClassifier(
    n_estimators=200,
    random_state=0,
)
text_classifier.fit(x_train, y_train)

In [None]:
# Predict labels for the held-out split with the fitted random forest.
predictions = text_classifier.predict(x_test)

In [None]:
from sklearn.metrics import confusion_matrix,accuracy_score

# Confusion matrix of true labels (first argument) against predicted labels.
print(confusion_matrix(y_test,predictions))

In [None]:
# Accuracy of the current `predictions` on the held-out split.
print('accuracy score',accuracy_score(y_test,predictions))

In [None]:
from sklearn.metrics import classification_report
# Text report of per-class metrics for the current `predictions`.
print(classification_report(y_test,predictions))

**2. Support Vector Machine**

In [None]:
from sklearn.svm import SVC

In [None]:
# Support vector classifier with a linear kernel and a fixed seed, fitted on
# the training split.
clf = SVC(
    kernel='linear',
    random_state=1,
)
clf.fit(x_train, y_train)

In [None]:
# Predict on the held-out split with the SVM. This rebinds `predictions`,
# replacing the previous model's predictions.
predictions = clf.predict(x_test)

In [None]:
from sklearn.metrics import confusion_matrix,accuracy_score

# Confusion matrix of true labels (first argument) against predicted labels.
print(confusion_matrix(y_test,predictions))

In [None]:
# Accuracy of the current `predictions` on the held-out split.
print('accuracy score',accuracy_score(y_test,predictions))

In [None]:
from sklearn.metrics import classification_report
# Text report of per-class metrics for the current `predictions`.
print(classification_report(y_test,predictions))

**3. Naive Bayes Classifier**

In [None]:
from sklearn.naive_bayes import MultinomialNB

In [None]:
# Multinomial naive Bayes with default settings, fitted on the training split.
classifier = MultinomialNB()
classifier.fit(x_train,y_train)

In [None]:
# Predict on the held-out split with naive Bayes. This rebinds `predictions`,
# replacing the previous model's predictions.
predictions = classifier.predict(x_test)

In [None]:
from sklearn.metrics import confusion_matrix,accuracy_score

# Confusion matrix of true labels (first argument) against predicted labels.
print(confusion_matrix(y_test,predictions))


In [None]:
# Accuracy of the current `predictions` on the held-out split.
print('accuracy score',accuracy_score(y_test,predictions))

In [None]:
from sklearn.metrics import classification_report
# Text report of per-class metrics for the current `predictions`.
print(classification_report(y_test,predictions))

**Based on the comparison and error analysis of the three models (Random Forest Classifier, Support Vector Machine, and Naive Bayes Classifier), we can clearly see that the Support Vector Machine (SVM) performs best among them all.**