## **Training a Sentiment Analysis Model using Scikit-learn**

**Importing Required Packages**

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
import re
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from string import punctuation
from sklearn.utils import shuffle
from sklearn.metrics import confusion_matrix,f1_score,precision_score,recall_score,accuracy_score,make_scorer

**Importing NLTK and downloading the required NLTK data packages**

In [None]:
import nltk

# Run this cell to download required packages
# (tokenizer models, the stop-word lists and the WordNet corpus).
for _resource in ('punkt', 'stopwords'):
  nltk.download(_resource)
# Kept as the final expression so the cell still displays the download status.
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

**Text Preprocessing Class**

In [None]:
class TextPreprocess:
  """Text-cleaning helpers that turn a raw string into a normalized string.

  ``normalizer`` chains the individual steps in a fixed order; every other
  method can also be called on its own.
  """

  # Stemmer and lemmatizer are created once and shared by all instances.
  port_stemmer = PorterStemmer()
  wordnet_lemmatizer = WordNetLemmatizer()
  # English stop words, filled in on first use and then reused by every call.
  _stop_words = None

  def __init__(self):
    """No per-instance state is needed."""

  def to_lower(self,text):
    """Return ``text`` converted to lower case."""
    text_lower = text.lower()
    return text_lower

  def remove_numbers(self,text):
    """Return ``text`` without the characters for which ``str.isdigit()`` is true."""
    output = ''.join(char for char in text if not char.isdigit())
    return output

  def remove_punctuation(self,text):
    """Return ``text`` without any character listed in ``string.punctuation``."""
    output = ''.join(c for c in text if c not in punctuation)
    return output

  def remove_Tags(self,text):
    """Return ``text`` with ``<...>`` spans removed.

    A span is a ``<``, then one or more characters other than ``<`` (matched
    lazily), then ``>``. This step is not part of ``normalizer``.
    """
    cleaned_text = re.sub('<[^<]+?>', '', text)
    return cleaned_text

  def remove_stopwords(self,sentence):
    """Tokenize ``sentence`` and return the non-stop-word tokens joined by spaces."""
    if TextPreprocess._stop_words is None:
      # Build the lookup once: fetching the word list for every sentence and
      # scanning a list per token is wasted work when normalizing a whole column.
      TextPreprocess._stop_words = frozenset(stopwords.words('english'))
    stop_words = TextPreprocess._stop_words
    output = ' '.join(w for w in nltk.word_tokenize(sentence) if w not in stop_words)
    return output

  def tokenize_word(self,text):
    """Split ``text`` into sentences, then words, and return one flat token list."""
    output = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    return output

  def stemmer(self, tokenList):
    """Return a list with the Porter stem of every token in ``tokenList``."""
    output = [self.port_stemmer.stem(word) for word in tokenList]
    return output

  def lemmatizer(self,tokenList):
    """Lemmatize every token in ``tokenList`` and return them as one space-joined string."""
    lemmatized_word = [self.wordnet_lemmatizer.lemmatize(word) for word in tokenList]
    output = " ".join(lemmatized_word)
    return output

  def normalizer(self, text):
    """Run the full cleaning pipeline on ``text`` and return the resulting string.

    Order: lower-case, strip punctuation, strip digits, drop stop words,
    tokenize, stem, lemmatize (which also joins the tokens back together).
    """
    lowerText = self.to_lower(text)
    clean_text = self.remove_punctuation(lowerText)
    clean_text = self.remove_numbers(clean_text)
    clean_text = self.remove_stopwords(clean_text)
    wordTokens = self.tokenize_word(clean_text)
    stemmedList = self.stemmer(wordTokens)
    lemmatizedText = self.lemmatizer(stemmedList)

    return lemmatizedText


**Loading the CSV dataset into a pandas DataFrame**

In [None]:
# Location of the tweets CSV file -- change csv path to match your environment.
TWEETS_CSV_PATH = '/content/Tweets.csv'
df = pd.read_csv(TWEETS_CSV_PATH)

In [None]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


**Applying the TextPreprocess class to the dataset**

In [None]:
textProcess = TextPreprocess()

# A fixed seed keeps the shuffled row order identical between runs.
df = shuffle(df, random_state=42)
# Target labels as strings; features are the normalized tweet texts.
y = df['airline_sentiment'].astype(str)
x = df.text.apply(textProcess.normalizer)

**Vectorizing the Text column**

In [None]:
# Learn the vocabulary from the normalized texts and build the token-count matrix.
# NOTE(review): the vocabulary is fitted on all rows before the train/validation
# split below, so validation texts contribute to it -- confirm this is intended.
vectorizer = CountVectorizer()
x_vectorized = vectorizer.fit_transform(x)

In [None]:
# Default split proportions; the fixed seed makes the train/validation
# partition (and every score computed from it) repeatable between runs.
train_x,val_x,train_y,val_y = train_test_split(x_vectorized,y,random_state=42)

**Applying Logistic Regression**

In [None]:
# Multinomial logistic regression optimized with the newton-cg solver.
regressor = LogisticRegression(solver='newton-cg', multi_class='multinomial')
# fit() returns the estimator itself, so `model` and `regressor` are the same object.
model = regressor.fit(train_x, train_y)
# Validation accuracy expressed as a percentage.
regression_accuracy = 100 * regressor.score(val_x, val_y)

In [None]:
# Candidate values for the regularization parameter C.
params = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}
# 5-fold cross-validated search over C, run in a single process.
gs_clf = GridSearchCV(estimator=model, param_grid=params, cv=5, n_jobs=1)
gs_clf.fit(train_x, train_y)
# Continue with the best configuration found by the search.
model = gs_clf.best_estimator_

In [None]:
# Number of labels held out for validation.
val_y.shape

(3660,)

**Evaluating predictions on the held-out validation data**

In [None]:
# Predict labels for the validation split and collect the evaluation metrics.
y_pred = model.predict(val_x)

# NOTE(review): with one label per sample, micro-averaged precision, recall and
# F1 are expected to equal plain accuracy -- consider macro/weighted averaging
# if per-class behaviour matters.
_averaging = 'micro'
_f1 = f1_score(val_y, y_pred, average=_averaging)
__precision = precision_score(val_y, y_pred, average=_averaging)
_recall = recall_score(val_y, y_pred, average=_averaging)
_confusion = confusion_matrix(val_y, y_pred)
# Accuracy is stored as a percentage; the other scores stay in the 0-1 range.
_model_accuracy = 100 * accuracy_score(val_y, y_pred)

_statistics = dict(
  f1_score=_f1,
  confusion_matrix=_confusion,
  precision=__precision,
  recall=_recall,
  accuracy=_model_accuracy,
)

In [None]:
# Show the collected metrics; requires the evaluation cell above to have run
# in this kernel session, since it defines `_statistics` and `y_pred`.
print(_statistics)
# Number of predictions made for the validation split.
y_pred.shape

NameError: ignored

**Testing Prediction Model**

In [None]:
text = ["This love this to the core"]
# Apply the same normalization the training texts went through, so the tokens
# line up with the vocabulary the vectorizer was fitted on.
test_feature = vectorizer.transform([textProcess.normalizer(t) for t in text])
predicted = model.predict(test_feature)
# A single unlabeled sample has no accuracy: scoring it against the validation
# labels fails on the length mismatch, so display the predicted label instead.
predicted

ValueError: ignored

**Saving the model in pickle format for use in the project**

In [None]:
import pickle

# Bundle the fitted vectorizer, the selected model and the validation labels.
model_pickle = {'vectorizer': vectorizer, 'model': model, 'actual_data': val_y}
# The context manager closes the file once the dump finishes, so the data is
# fully written to disk and no handle is left open.
with open('models'+".p", "wb") as model_file:
  pickle.dump(model_pickle, model_file)