In [None]:
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble

import pandas as pd,numpy

# Data Importing

In [None]:
# Load the sentence/sentiment table; the path is relative to the notebook's working directory.
df = pd.read_csv("data.csv")


In [None]:
# Preview the first rows to confirm the columns loaded as expected.
df.head()

Unnamed: 0,Sentence,Sentiment
0,The GeoSolutions technology will leverage Bene...,positive
1,"$ESI on lows, down $1.50 to $2.50 BK a real po...",negative
2,"For the last quarter of 2010 , Componenta 's n...",positive
3,According to the Finnish-Russian Chamber of Co...,neutral
4,The Swedish buyout firm has sold its remaining...,neutral


In [None]:
# Count rows per sentiment class to see how balanced the target is.
df["Sentiment"].value_counts()

neutral     3130
positive    1852
negative     860
Name: Sentiment, dtype: int64

# Data Cleaning

In [None]:
# Cleaning consists of removing stop words and punctuation and lemmatizing each token.
import string
# string.punctuation is a single string holding the ASCII punctuation characters.
punct = string.punctuation

In [None]:
# Load spaCy's small English pipeline and its built-in English stop-word collection.
import spacy
from spacy.lang.en.stop_words import STOP_WORDS as SW

nlp = spacy.load('en_core_web_sm')
# Materialize the stop words as a list for the membership checks done during cleaning.
stopwords = list(SW)

In [None]:
def text_data_cleaning(sentence):
    """Tokenize ``sentence`` with spaCy and return its cleaned, lower-cased tokens.

    Every token is replaced by its lemma, lower-cased and stripped of surrounding
    whitespace. Tokens whose lemma is the "-PRON-" marker (the pronoun lemma in
    spaCy v2) fall back to their lower-cased surface form instead. Tokens that
    are in ``stopwords`` or that occur inside ``punct`` are dropped.

    Parameters
    ----------
    sentence : str
        Raw text to clean.

    Returns
    -------
    list of str
        The surviving tokens, in document order.
    """
    normalized = [
        tok.lower_ if tok.lemma_ == "-PRON-" else tok.lemma_.lower().strip()
        for tok in nlp(sentence)
    ]
    # `punct` is a string, so `in` is a substring test: besides single punctuation
    # characters it also matches the empty string left by a whitespace-only lemma.
    return [word for word in normalized if not (word in stopwords or word in punct)]

In [None]:
# Sanity-check the cleaner on one sentence (it appears to be the first row shown by df.head()).
text_data_cleaning("The GeoSolutions technology will leverage Benefon 's GPS solutions by providing Location Based Search Technology , a Communities Platform , location relevant multimedia content and a new and powerful commercial model")

['geosolutions',
 'technology',
 'leverage',
 'benefon',
 'gps',
 'solution',
 'provide',
 'location',
 'based',
 'search',
 'technology',
 'communities',
 'platform',
 'location',
 'relevant',
 'multimedia',
 'content',
 'new',
 'powerful',
 'commercial',
 'model']

## Dataset preparation: training and test split

In [None]:
from sklearn.model_selection import train_test_split


In [None]:
# Hold out 30% of the rows for evaluation; the fixed seed makes the shuffled split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    df['Sentence'],
    df['Sentiment'],
    test_size=0.3,
    random_state=0,
    shuffle=True,
)

In [None]:
# Inspect the held-out labels; they are still strings here because encoding happens in a later cell.
y_test

2027    positive
1160     neutral
4769    positive
1511    positive
4800    positive
          ...   
5565     neutral
1824     neutral
3213    positive
2385     neutral
4362    positive
Name: Sentiment, Length: 1753, dtype: object

## Encoding the target labels as integers:


0.   negative
1.   neutral
2.   positive



In [None]:
# Label-encode the target variable as integers.
# The encoder is fitted on the training labels only and that same mapping is then
# applied to the test labels. Calling fit_transform on the test split would rebuild
# the mapping from whichever classes appear there, so the integer codes could
# silently disagree with the ones the model is trained on. With transform(), a label
# that was never seen in training raises a ValueError instead.
encoder = preprocessing.LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_test = encoder.transform(y_test)


# Feature Engineering
Raw text data is transformed into feature vectors, and new features are created from the existing dataset.

In [None]:
# N-gram level TF-IDF: word bigrams and trigrams, capped at 5000 features.
# The vocabulary and IDF weights are learned from the training split only. Fitting on
# the full dataframe would let test-set sentences influence the features (data leakage).
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
xtrain_tfidf_ngram = tfidf_vect_ngram.fit_transform(x_train)
# The test split is only transformed with the vocabulary learned above.
xtest_tfidf_ngram = tfidf_vect_ngram.transform(x_test)

# Training the Model

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression


In [None]:
# TF-IDF vectorizer that delegates tokenization to text_data_cleaning.
# lowercase=False leaves casing to that tokenizer, which lower-cases every token it returns.
tfidf = TfidfVectorizer(
    tokenizer=text_data_cleaning,
    strip_accents=None,
    lowercase=False,
    preprocessor=None,
)

In [None]:
# Chain the vectorizer and a logistic-regression classifier into a single estimator,
# so raw sentences can be passed straight to fit() and predict().
lr_tfidf = Pipeline(steps=[
    ('vect', tfidf),
    ('classifier', LogisticRegression(random_state=0)),
])

In [None]:
# Train on the raw training sentences; the pipeline vectorizes them before fitting the classifier.
lr_tfidf.fit(x_train, y_train)


Pipeline(steps=[('vect',
                 TfidfVectorizer(lowercase=False,
                                 tokenizer=<function text_data_cleaning at 0x7fa3f4f590e0>)),
                ('classifier', LogisticRegression(random_state=0))])

# Model creation and training completed

# Let's predict with the model

In [None]:
# Predict the encoded sentiment label for every held-out sentence.
y_pred = lr_tfidf.predict(x_test)

print(y_pred[0])
# Preview up to the first five predictions. Slicing (rather than indexing with
# range(5)) avoids an IndexError when fewer than five predictions exist.
for label in y_pred[:5]:
    print("--->test data--sentiment-->", label)

2
--->test data--sentiment--> 2
--->test data--sentiment--> 1
--->test data--sentiment--> 1
--->test data--sentiment--> 2
--->test data--sentiment--> 1


In [None]:
#to compare or check the predicted data we have to import these
from sklearn.metrics import classification_report,accuracy_score,confusion_matrix

In [None]:
# Fraction of held-out sentences whose predicted label matches the encoded true label.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.677124928693668

In [None]:
# Per-class precision, recall and F1; the row labels are the integer-encoded classes.
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.35      0.12      0.18       283
           1       0.68      0.88      0.77       940
           2       0.73      0.62      0.67       530

    accuracy                           0.68      1753
   macro avg       0.59      0.54      0.54      1753
weighted avg       0.64      0.68      0.64      1753



# Let's check on the actual dataset we have...

In [None]:
# Show the first ten labelled rows for reference before running ad-hoc predictions.
df.head(10)

Unnamed: 0,Sentence,Sentiment
0,The GeoSolutions technology will leverage Bene...,positive
1,"$ESI on lows, down $1.50 to $2.50 BK a real po...",negative
2,"For the last quarter of 2010 , Componenta 's n...",positive
3,According to the Finnish-Russian Chamber of Co...,neutral
4,The Swedish buyout firm has sold its remaining...,neutral
5,$SPY wouldn't be surprised to see a green close,positive
6,Shell's $70 Billion BG Deal Meets Shareholder ...,negative
7,SSH COMMUNICATIONS SECURITY CORP STOCK EXCHANG...,negative
8,Kone 's net sales rose by some 14 % year-on-ye...,positive
9,The Stockmann department store will have a tot...,neutral


In [None]:
# Ad-hoc input to score; use df["Sentence"].head(10) here to score dataset rows instead.
top_sents = ["Hulu has a great UI"]

# Predict the encoded label for each sentence and print it next to its text.
prd = lr_tfidf.predict(top_sents)
for sentence, label in zip(top_sents, prd):
    print("Sentence : ", sentence, "\nSentiment : ", label)
  

Sentence :  Hulu has a great UI 
Sentiment :  2


In [None]:
import pickle

# Persist the fitted pipeline to disk so it can be reloaded without retraining.
with open('Sentiment_analysis_lr.pkl', 'wb') as model_file:
    pickle.dump(lr_tfidf, model_file, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Reload the pickled pipeline and check that it reproduces the test accuracy.
# NOTE: pickle.load can execute arbitrary code, so only load files you created yourself.
with open('Sentiment_analysis_lr.pkl', 'rb') as handle:
    b = pickle.load(handle)
# Score with the reloaded pipeline `b`, not the in-memory `lr_tfidf`; otherwise this
# cell would never exercise the save/load round trip it is meant to verify.
y_pred = b.predict(x_test)
print(accuracy_score(y_test,y_pred))

0.677124928693668
