# Sentiment Analysis

In [1]:
# Uncomment to install NLTK into the kernel's environment if it is missing:
# %pip install nltk

### Importing Libraries

In [2]:
import numpy as np 
import pandas as pd 
import re
import nltk 
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
# Fetch the NLTK stop-word corpus that stopwords.words('english') reads when
# the TF-IDF vectorizer is built later in the notebook.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### Importing Dataset

In [3]:
# Read the CSV into a DataFrame and report its (rows, columns) dimensions.
dataset = pd.read_csv('dataset.csv')
n_rows, n_cols = dataset.shape
print((n_rows, n_cols))

(14640, 15)


In [4]:
# Show the first five rows (the default for head()) to inspect the columns.
first_rows = dataset.head()
print(first_rows)

             tweet_id airline_sentiment  airline_sentiment_confidence  \
0  570306133677760513           neutral                        1.0000   
1  570301130888122368          positive                        0.3486   
2  570301083672813571           neutral                        0.6837   
3  570301031407624196          negative                        1.0000   
4  570300817074462722          negative                        1.0000   

  negativereason  negativereason_confidence         airline  \
0            NaN                        NaN  Virgin America   
1            NaN                     0.0000  Virgin America   
2            NaN                        NaN  Virgin America   
3     Bad Flight                     0.7033  Virgin America   
4     Can't Tell                     1.0000  Virgin America   

  airline_sentiment_gold        name negativereason_gold  retweet_count  \
0                    NaN     cairdin                 NaN              0   
1                    NaN    jnar

### Separating the Dataset into Inputs and Outputs

In [5]:
# Count missing values in the label and text columns. isnull() alone yields a
# boolean Series with one entry per row; summing it gives the number of
# missing entries, which must be 0 for both columns before modelling.
print(dataset['airline_sentiment'].isnull().sum())
print(dataset['text'].isnull().sum())

0
0


In [6]:
# Select the tweet text (model input) and the sentiment label (model output)
# by column name, so the selection does not silently break if the column
# order of the CSV changes.
features = dataset['text'].values
labels = dataset['airline_sentiment'].values
print(features)
print(labels)

['@VirginAmerica What @dhepburn said.'
 "@VirginAmerica plus you've added commercials to the experience... tacky."
 "@VirginAmerica I didn't today... Must mean I need to take another trip!"
 ... '@AmericanAir Please bring American Airlines to #BlackBerry10'
 "@AmericanAir you have my money, you change my flight, and don't answer your phones! Any other suggestions so I can make my commitment??"
 '@AmericanAir we have 8 ppl so we need 2 know how many seats are on the next flight. Plz put us on standby for 4 people on the next flight?']
['neutral' 'positive' 'neutral' ... 'neutral' 'negative' 'neutral']


### Removing Special Characters

In [7]:
def clean_text(text):
    """Normalise one raw tweet into lowercase, space-separated word tokens.

    Args:
        text: The raw tweet; it is converted with str() so non-string
            values are handled as well.

    Returns:
        The cleaned, lowercased string. It may start or end with a single
        space where a removed token used to be.
    """
    # Replace every non-word character (punctuation, '@', '#', ...) by a space.
    cleaned = re.sub(r'\W', ' ', str(text))

    # Drop single letters that stand alone between whitespace.
    cleaned = re.sub(r'\s+[a-zA-Z]\s+', ' ', cleaned)

    # Drop a single letter at the very start of the text. The caret must be
    # an unescaped anchor: r'\^' matches a literal '^', which cannot occur
    # after the \W substitution above, so that pattern never fired. This
    # also covers a leading 'b' token, so no separate '^b' step is needed.
    cleaned = re.sub(r'^[a-zA-Z]\s+', ' ', cleaned)

    # Collapse runs of whitespace into a single space.
    cleaned = re.sub(r'\s+', ' ', cleaned)

    return cleaned.lower()


features_list = [clean_text(tweet) for tweet in features]


### Converting Text to Vectors (TF-IDF)


In [8]:
# The stop-word corpus is already downloaded in the imports cell, so it is
# not fetched a second time here.
# NOTE(review): the vectorizer is fitted on the full corpus before the
# train/test split, so its vocabulary and IDF weights are computed from test
# tweets too. For an unbiased evaluation, split first and fit on the
# training portion only.
vectorizer = TfidfVectorizer(
    max_features=2500,
    min_df=7,
    max_df=0.8,
    stop_words=stopwords.words('english'),
)
new_features = vectorizer.fit_transform(features_list).toarray()
print(new_features)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


### Splitting the Dataset into Train and Test Sets

In [9]:
# Hold out 20% of the samples for evaluation; the fixed random_state keeps
# the split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    new_features,
    labels,
    test_size=0.2,
    random_state=0,
)

### Training the Random Forest Classifier

In [10]:
# Build a 200-tree random forest with a fixed seed, then fit it on the
# training split. fit() stays the last expression so the cell still displays
# the fitted estimator.
text_classifier = RandomForestClassifier(
    n_estimators=200,
    random_state=0,
)
text_classifier.fit(X_train, y_train)

RandomForestClassifier(n_estimators=200, random_state=0)

### Predicting on the Test Data with the Trained Model

In [11]:
# Predict a sentiment label for every held-out test sample.
predictions = text_classifier.predict(X_test)

### Accuracy Score of the Model

In [12]:
# Fraction of test samples whose predicted label equals the true label.
test_accuracy = accuracy_score(y_test, predictions)
print(test_accuracy)

0.7599043715846995
