In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
sns.set()

In [3]:
import nltk   # Importing NLP Tool Kit
import string   #For getting punctuations from a string
from nltk.corpus import stopwords  # For getting stopwords like 'the,in,been...'
import re  # For Regular Expressions

In [4]:
tweet_data = pd.read_csv('Tweet_Sentiment_Analysis_test.csv',index_col=0)

In [5]:
tweet_data.head() #Getting the head of the data we have read from csv

Unnamed: 0_level_0,label,tweet
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0,#fingerprint #Pregnancy Test https://goo.gl/h1...
2,0,Finally a transparant silicon case ^^ Thanks t...
3,0,We love this! Would you go? #talk #makememorie...
4,0,I'm wired I know I'm George I was made that way
5,1,What amazing service! Apple won't even talk to...


In [6]:
tweet_data.info() #Getting info of the data

<class 'pandas.core.frame.DataFrame'>
Int64Index: 666 entries, 1 to 666
Data columns (total 2 columns):
label    666 non-null int64
tweet    666 non-null object
dtypes: int64(1), object(1)
memory usage: 15.6+ KB


In [7]:
def text_processor(tweet):
    """
    Tokenization(Splitting a sentence into list of words) of the tweet
      1. Removing URLs from the tweet
      2. Removing Punctuations from the tweet
      3. Removing stopwords (the,he,she,it)
    """
    no_url_tweet = re.sub('http\S+',"", tweet) #URL Removing
    no_url_tweet2 = re.sub('twitter\S+','', no_url_tweet)
    no_punc_tweet = [char for char in no_url_tweet2 if char not in string.punctuation] #Removing Punctuations
    #After removing Punctuations,no_punc_tweet is a list of characters...So we make it as words using .join()
    #method and store that words list in no_punc_tweet
    no_punc_tweet = ''.join(no_punc_tweet)
    #Now we are using stopwords module from NLTK package to remove StopWords like (i,me,my,myself,we,our,etc...)
    cleaned_tweet = [word for word in no_punc_tweet.split() if word.lower() not in stopwords.words('english')]
    return cleaned_tweet

In [8]:
# Applying the text_processor() method on all tweets for checking...
tweet_data['tweet'].apply(text_processor).head()

id
1    [fingerprint, Pregnancy, Test, android, apps, ...
2    [Finally, transparant, silicon, case, Thanks, ...
3    [love, Would, go, talk, makememories, unplug, ...
4             [Im, wired, know, Im, George, made, way]
5    [amazing, service, Apple, wont, even, talk, qu...
Name: tweet, dtype: object

In [9]:
# Vectorization - Making a list of words to a vector which Machine Learning Algorithm can understand

    #1. Count no. of times a word occured in a tweet(Also Known As 'Term Frequency')
    #2. Knowing how important the word is by using IDF(Inver Document Frequency)
    #3. Normalize the vectors to unit length, to abstract from the original text length

# We are using scikit-learn's CountVectorizer()

from sklearn.feature_extraction.text import CountVectorizer

In [10]:
#To Transform each word into a vector we are initialising an instance for CountVectorizer
bow_transformer = CountVectorizer(analyzer=text_processor).fit(tweet_data['tweet'])

In [11]:
# Checking which word is having the 2220 index
print(bow_transformer.get_feature_names()[2220])

iPad


In [12]:
# To know that how many words are there in our whole data
print(len(bow_transformer.vocabulary_))

3816


In [13]:
# Transforming each word into a vector by using the instance of CountVectorizer
tweets_bow = bow_transformer.transform(tweet_data['tweet'])

In [14]:
# TF-IDF  from scikit-learn
# To know how important that word is for our analysis

from sklearn.feature_extraction.text import TfidfTransformer

In [15]:
# Creating an instance for TfidfTransformer
tfidf_transformer = TfidfTransformer().fit(tweets_bow)

In [16]:
# Applying TfidfTransformer to all the tweets
tweets_tfidf = tfidf_transformer.transform(tweets_bow)

In [17]:
#Now, we use Naive bayes Algorithm to classify

from sklearn.naive_bayes import MultinomialNB

In [18]:
pos_neg_model = MultinomialNB().fit(tweets_tfidf, tweet_data['label'])

In [19]:
all_pred = pos_neg_model.predict(tweets_tfidf)

In [20]:
# Importing train_test_split
from sklearn.model_selection import train_test_split

In [21]:
tweet_train, tweet_test, label_train, label_test = train_test_split(tweet_data['tweet'], tweet_data['label'], test_size=0.3)

In [26]:
# We are using pipeline method from sklearn.pipeline for simplicity
# By using pipeline we can do Vectorization, TfidfTransformation, Classification of data by a model...
# All can be done in 1 step without doing all Manually
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

pipeline = Pipeline([
    ('bow', CountVectorizer(analyzer=text_processor)),
    ('tfidf', TfidfTransformer()),
    ('Classifier', LogisticRegression())
])

pipeline.fit(tweet_train, label_train)



Pipeline(memory=None,
     steps=[('bow', CountVectorizer(analyzer=<function text_processor at 0x7fe9b0e2e510>,
        binary=False, decode_error='strict', dtype=<class 'numpy.int64'>,
        encoding='utf-8', input='content', lowercase=True, max_df=1.0,
        max_features=None, min_df=1, ngram_range=(1, 1), preprocessor=...penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))])

In [27]:
# Making Prediction
pipeline_pred = pipeline.predict(tweet_test)

In [28]:
# Knowing the Classification Report to know the Accuracy of our prediction
from sklearn.metrics import classification_report, accuracy_score

print("Classification Report:\n", classification_report(label_test, pipeline_pred))
print('Accuracy Score:\n\t', accuracy_score(label_test, pipeline_pred)*100)

Classification Report:
               precision    recall  f1-score   support

           0       0.79      0.99      0.88       156
           1       0.60      0.07      0.12        44

   micro avg       0.79      0.79      0.79       200
   macro avg       0.69      0.53      0.50       200
weighted avg       0.75      0.79      0.71       200

Accuracy Score:
	 0.785


In [None]:
pipeline_pred