### Importing Libraries

In [2]:
import numpy as np
import pandas as pd

In [21]:
import re

In [74]:
import pickle

In [3]:
import matplotlib.pyplot as plt

In [4]:
import seaborn as sns

In [24]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [5]:
%matplotlib inline

### Setting column names and loading the dataset

In [6]:
# Column names are supplied explicitly via `names=`, so pandas treats the first
# line of the CSV as data rather than as a header row.
columns_name = ['target', 'ids', 'date', 'flag', 'user', 'tweet']
twitter_data = pd.read_csv('twitter_dataset.csv', names=columns_name)

### Head of the Dataset

In [7]:
# Preview the first rows to confirm the column names line up with the data.
twitter_data.head()

Unnamed: 0,target,ids,date,flag,user,tweet
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [9]:
# Relabel the class value 4 as 1 so the target column holds only 0 and 1.
twitter_data['target'] = twitter_data['target'].replace({4: 1})

In [13]:
# Count the rows per class label to check how balanced the two classes are.
twitter_data['target'].value_counts()

target
0    800000
1    800000
Name: count, dtype: int64

### Creating a PorterStemmer instance to stem the tweet column of twitter_data

In [25]:
# Single stemmer instance shared by every call to stemming().
port_stem = PorterStemmer()

In [27]:
def stemming(content, stop_words=None):
    """Normalise a piece of text into a space-separated string of stemmed tokens.

    Processing steps, in order: every character that is not an ASCII letter
    is replaced by a space, the text is lowercased and split on whitespace,
    stop words are dropped, and each remaining token is stemmed with the
    module-level ``port_stem``.

    Parameters
    ----------
    content : str
        Raw text to clean.
    stop_words : collection of str, optional
        Lowercase words to discard. When omitted, NLTK's English stop-word
        list is used; it is loaded on the first call and cached on the
        function object for all later calls.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces (empty string when no
        token survives).
    """
    if stop_words is None:
        # The previous version evaluated stopwords.words('english') once per
        # token, rebuilding the list and scanning it linearly each time.
        # Loading it once into a frozenset makes each lookup O(1).
        stop_words = getattr(stemming, '_english_stop_words', None)
        if stop_words is None:
            stop_words = frozenset(stopwords.words('english'))
            stemming._english_stop_words = stop_words

    tokens = re.sub('[^a-zA-Z]', ' ', content).lower().split()
    return ' '.join(port_stem.stem(token) for token in tokens if token not in stop_words)

### Performing stemming on every record's tweet

In [28]:
# Run stemming() on every tweet and keep the result in a new column, leaving
# the original 'tweet' text untouched. This is a per-row Python call, so it is
# the slow step of the notebook on the full dataset.
twitter_data['stemmed_data'] = twitter_data['tweet'].apply(stemming)

In [29]:
# Preview the frame again to compare 'tweet' with the new 'stemmed_data' column.
twitter_data.head()

Unnamed: 0,target,ids,date,flag,user,tweet,stemmed_data
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",switchfoot http twitpic com zl awww bummer sho...
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,upset updat facebook text might cri result sch...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,kenichan dive mani time ball manag save rest g...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,whole bodi feel itchi like fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....",nationwideclass behav mad see


### Getting stemmed_data and target values in different variables for training model

In [31]:
# Pull the cleaned text (features) and the class labels out as NumPy arrays.
X = twitter_data['stemmed_data'].values
Y = twitter_data['target'].values

In [32]:
# Inspect the feature array (the repr abbreviates the middle of long arrays).
X

array(['switchfoot http twitpic com zl awww bummer shoulda got david carr third day',
       'upset updat facebook text might cri result school today also blah',
       'kenichan dive mani time ball manag save rest go bound', ...,
       'readi mojo makeov ask detail',
       'happi th birthday boo alll time tupac amaru shakur',
       'happi charitytuesday thenspcc sparkschar speakinguph h'],
      dtype=object)

In [33]:
# Inspect the label array.
Y

array([0, 0, 0, ..., 1, 1, 1], dtype=int64)

### Dividing X and Y values into training and testing datasets

In [66]:
# Hold out 35% of the samples for testing. Stratifying on Y keeps the class
# proportions equal in both splits; the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.35,
    stratify=Y,
    random_state=101,
)

### Creating an instance of TfidfVectorizer to convert text data into numerical features for further processing

In [59]:
# TF-IDF vectorizer with default settings; it is fitted on the training text below.
vectorizer = TfidfVectorizer()

### Converting text to numerical features

In [67]:
# Fit the vectorizer on the training text only, then reuse the fitted
# vectorizer for the test text so the test split does not influence the features.
# NOTE(review): X_train / X_test are rebound from raw text to vectorized
# matrices here, so this cell presumably cannot be re-run on its own without
# first re-running the train/test split cell -- confirm and consider new names.
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

### Creating an instance of the LogisticRegression model for training

In [68]:
# max_iter is raised to 1000, presumably so the solver has room to converge
# on this dataset -- confirm against convergence warnings.
model = LogisticRegression(max_iter=1000)

In [69]:
# Train the classifier on the vectorized training text and its labels.
model.fit(X_train,Y_train)

### Checking accuracy of model

In [70]:
# Accuracy on the same data the model was fitted on; useful mainly as a
# reference point for spotting over-fitting against the test accuracy.
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)

In [71]:
# Display the training-set accuracy.
training_data_accuracy

0.8084096153846154

In [72]:
# Accuracy on the held-out test split, which the model did not see during fitting.
X_test_prediction = model.predict(X_test)
testing_data_accuracy = accuracy_score(Y_test,X_test_prediction)

In [73]:
# Display the test-set accuracy.
testing_data_accuracy

0.7767089285714286

### Saving the model in a .sav file to use it whenever needed 

In [75]:
# Serialize the trained classifier to disk. The context manager closes (and
# therefore flushes) the file handle even if pickling raises; the previous
# bare open() call left the handle open.
# NOTE(review): only the classifier is saved. Predicting on new text will also
# need the fitted `vectorizer`, so consider persisting it alongside the model.
with open('TwitterSentimentAnalysisModel.sav', 'wb') as model_file:
    pickle.dump(model, model_file)