In [0]:
# DataFrame
import pandas as pd

# Matplot
import matplotlib.pyplot as plt
%matplotlib inline

# Scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.manifold import TSNE
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.datasets import make_classification
from sklearn import datasets, svm
from sklearn.kernel_approximation import Nystroem
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix, classification_report

# nltk
import nltk
from nltk.corpus import stopwords
from  nltk.stem import SnowballStemmer

# Word2vec
import gensim

# Utility
import re
import numpy as np
import os
from collections import Counter
import logging
import time
import pickle
import itertools

In [0]:
import numpy as np # linear algebra
import pandas as pd

In [0]:
# DATASET
DATASET_COLUMNS = ["target", "ids", "date", "flag", "user", "text"]
DATASET_ENCODING = "ISO-8859-1"
TRAIN_SIZE = 0.8


# TEXT CLENAING
TEXT_CLEANING_RE = "@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+"

In [0]:
dataset_path = "/content/drive/My Drive/training.1600000.processed.noemoticon.csv"
print("Open file:", dataset_path)
df = pd.read_csv(dataset_path, encoding =DATASET_ENCODING , names=DATASET_COLUMNS)

Open file: /content/drive/My Drive/training.1600000.processed.noemoticon.csv


In [0]:
# nltk
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from  nltk.stem import SnowballStemmer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [0]:
stop_words = stopwords.words("english")
stemmer = SnowballStemmer("english")

In [0]:
def preprocess(text, stem=False):
    # Remove link,user and special characters
    text = re.sub(TEXT_CLEANING_RE, ' ', str(text).lower()).strip()  
    ''''In re. sub() , specify a regular expression pattern in the first argument,
     a new string in the second argument, and a string to be processed in the third argument'''
    tokens = []
    for token in text.split():
      if token not in stop_words:
          if stem:
            tokens.append(stemmer.stem(token))
          else:
            tokens.append(token)
    return " ".join(tokens)

In [0]:
#Here I am preprocessing the entire dataset
df.text = df.text.apply(lambda x: preprocess(x))

In [0]:
df.head(5)

Unnamed: 0,target,ids,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,awww bummer shoulda got david carr third day
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,upset update facebook texting might cry result...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,dived many times ball managed save 50 rest go ...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,whole body feels itchy like fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,behaving mad see


In [0]:
tf=df.sample(n=1600000)

In [0]:
stopset=set(stopwords.words("english"))
vectorizer =TfidfVectorizer(use_idf=True,lowercase=True,strip_accents='ascii',stop_words=stopset)

In [0]:
y=tf.target

In [0]:
x=vectorizer.fit_transform(tf.text)

In [0]:
x_train,x_test,y_train,y_test=train_test_split(x,y,train_size=0.7,random_state=42)

In [0]:
target_names = ['Negative', 'Positive']

In [0]:
clf = LinearSVC(penalty='l2',random_state=0, tol=1e-5)

In [0]:
clf.fit(x_train,y_train)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=0, tol=1e-05,
          verbose=0)

In [0]:
predicted_value=clf.predict(x_test)

In [0]:
real=y_test.to_list()

In [0]:
print("For tfidf+Linear SVC")
print(classification_report(real, predicted_value,target_names=target_names))

For tfidf+Linear SVC
              precision    recall  f1-score   support

    Negative       0.78      0.75      0.77    239925
    Positive       0.76      0.79      0.78    240075

    accuracy                           0.77    480000
   macro avg       0.77      0.77      0.77    480000
weighted avg       0.77      0.77      0.77    480000



Below is the approcah for Naive Bayes

In [0]:
tf=df.sample(n=1600000)

In [0]:
x_trainN, x_testN, y_trainN, y_testN = train_test_split(x, y, test_size=0.3, random_state=42)

In [0]:
from sklearn import naive_bayes
Naive = naive_bayes.MultinomialNB()
Naive.fit(x_trainN,y_trainN)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [0]:
prediction_N=Naive.predict(x_testN)

In [0]:
print("This is for tfidf(unigram)+Naive Bayes")
print(classification_report(prediction_N,y_testN,target_names=target_names))

This is for tfidf(unigram)+Naive Bayes
              precision    recall  f1-score   support

    Negative       0.78      0.75      0.77    247970
    Positive       0.74      0.77      0.76    232030

    accuracy                           0.76    480000
   macro avg       0.76      0.76      0.76    480000
weighted avg       0.76      0.76      0.76    480000



In [0]:
#Below is the approach for n gram
stopset=set(stopwords.words("english"))
vectorizer =TfidfVectorizer(use_idf=True,lowercase=True,strip_accents='ascii',stop_words=stopset,ngram_range=(1,2))

In [0]:
y=tf.target
x=vectorizer.fit_transform(tf.text)

In [0]:
x_train,x_test,y_train,y_test=train_test_split(x,y,train_size=0.7,random_state=42)

In [0]:
x_trainN, x_testN, y_trainN, y_testN = train_test_split(x, y, test_size=0.3, random_state=42)

In [0]:
target_names = ['Negative', 'Positive']

In [0]:
clf = LinearSVC(penalty='l2',random_state=0, tol=1e-5)

In [0]:
clf.fit(x_train,y_train)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=0, tol=1e-05,
          verbose=0)

In [0]:
predicted_value=clf.predict(x_test)
real=y_test.to_list()

In [0]:
print("For tfidf+(unigram_bigram)+Linear SVC")
print(classification_report(real, predicted_value,target_names=target_names))

For tfidf+(unigram_bigram)+Linear SVC
              precision    recall  f1-score   support

    Negative       0.80      0.76      0.78    239714
    Positive       0.77      0.80      0.79    240286

    accuracy                           0.78    480000
   macro avg       0.78      0.78      0.78    480000
weighted avg       0.78      0.78      0.78    480000



In [0]:
Naive = naive_bayes.MultinomialNB()
Naive.fit(x_trainN,y_trainN)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [0]:
prediction_N=Naive.predict(x_testN)

In [0]:
print("This is for tfidf(unigram+bigram)+Naive Bayes")
print(classification_report(prediction_N,y_testN,target_names=target_names))

This is for tfidf(unigram+bigram)+Naive Bayes
              precision    recall  f1-score   support

    Negative       0.80      0.77      0.78    249626
    Positive       0.76      0.79      0.77    230374

    accuracy                           0.78    480000
   macro avg       0.78      0.78      0.78    480000
weighted avg       0.78      0.78      0.78    480000

