<a href="https://colab.research.google.com/github/raviloartanza/GROUP_11_SVM_MODEL/blob/main/RM_SVM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Library

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from wordcloud import WordCloud, STOPWORDS
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize
from bs4 import BeautifulSoup
import spacy
import re, string, unicodedata
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.stem import LancasterStemmer, WordNetLemmatizer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from textblob import TextBlob
from textblob import Word
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import warnings
warnings.filterwarnings('ignore')

# Dataset

IMPORT THE TRAINING DATASET

In [None]:
# Import dataset
path = "/content/drive/MyDrive/mbsa.csv"
data = pd.read_csv(path, names=['date', 'review', 'sentiment'], header=0)
filtr = data.loc[600001:]
data.drop(index=filtr.index, inplace=True)
print(data.shape)
data.head(10)

(600001, 3)


Unnamed: 0,date,review,sentiment
0,2019-05-27,È appena uscito un nuovo video! LES CRYPTOMONN...,Positive
1,2019-05-27,Cardano: Digitize Currencies; EOS https://t.co...,Positive
2,2019-05-27,Another Test tweet that wasn't caught in the s...,Positive
3,2019-05-27,Current Crypto Prices! \n\nBTC: $8721.99 USD\n...,Positive
4,2019-05-27,Spiv (Nosar Baz): BITCOIN Is An Asset &amp; NO...,Positive
5,2019-05-27,#btc inceldiği yerden kopsun bakalım 17:00 ye ...,Positive
6,2019-05-27,@nwoodfine We have been building on the real #...,Positive
7,2019-05-27,"@pedronauck como investidor, vc é um ótimo dev...",Positive
8,2019-05-27,ブラジルはまぁ置いといてもドイツは存在感出してくるのかな。ロシアもマイニングなどで元気になる...,Positive
9,2019-05-27,"CHANGE IS COMING...GET READY!!! Boom, Another ...",Positive


EXPLORATERY DATA ANALYSIS

In [None]:
data.describe()

Unnamed: 0,date,review,sentiment
count,600001,600001,600001
unique,11,547388,2
top,2019-05-21,#blockchain #cryptocurrency #bitcoin #ethereum...,Positive
freq,82405,2142,418852


SENTIMENT COUNT

In [None]:
data['sentiment'].value_counts()

sentiment
Positive    418852
Negative    181149
Name: count, dtype: int64

SPLITTING THE TRAINING DATASET

In [None]:
# Train dataset
train_reviews = data.review[:300000]
train_sentiments = data.sentiment[:300000]

# Test dataset
test_reviews = data.review[300001:]
test_sentiments = data.sentiment[300001:]
print(train_reviews.shape, train_sentiments.shape)
print(test_reviews.shape, test_sentiments.shape)

(300000,) (300000,)
(300000,) (300000,)


# Pre-Processing

TEXT NORMALIZATION

In [None]:
# Tokenization
tokenizer = ToktokTokenizer()

# English stopwords
stopword_list = nltk.corpus.stopwords.words('english')

REMOVING HTML STRIPS AND NOISE TEXT

In [None]:
# Removing the html strips
def strip_html(text):
  soup = BeautifulSoup(text, "html.parser")
  return soup.get_text()

# Removing the square brackets
def remove_between_square_brackets(text):
  return re.sub('\[[^]]*\]', '', text)

# Removing the noise text
def denoise_text(text):
  if isinstance(text, str):
    text = strip_html(text)
    text = remove_between_square_brackets(text)
    return text
  return text

data['review'] = data['review'].apply(denoise_text)

REMOVING SPECIAL CHARACTERS

In [None]:
def remove_special_characters(text, remove_digits=True):
    if not isinstance(text, str):
        text = str(text)
    pattern = r'[^a-zA-Z0-9\s]'
    text = re.sub(pattern, ' ', text)
    return text

data['review'] = data['review'].apply(remove_special_characters)

TEXT STEMMING

In [None]:
def simple_stemmer(text):
  ps = nltk.porter.PorterStemmer()
  text = ' '.join([ps.stem(word) for word in text.split()])
  return text

data['review'] = data['review'].apply(simple_stemmer)

REMOVING STOPWORDS

In [None]:
stop = set(stopwords.words('english'))
print(stop)

def remove_stopwords(text, is_lower_case=False):
  tokens = tokenizer.tokenize(text)
  tokens = [token.strip() for token in tokens]
  if is_lower_case:
    filtered_tokens = [token for token in tokens if token not in stopword_list]
  else:
    filtered_tokens = [token for token in tokens if token.lower() not in stopword_list]
  filtered_text = ' '.join(filtered_tokens)
  return filtered_text

data['review'] = data['review'].apply(remove_stopwords)

{'hasn', 'both', "weren't", 'yourselves', 'that', "it's", 'there', 'didn', 'his', 'no', 'ourselves', 'these', 'just', 'into', 'when', 'nor', 'by', 'through', 'himself', 'it', "shouldn't", 'yours', 'this', 'be', 'a', 'ma', 't', 'don', 'their', 'during', 'its', 'whom', 'weren', 'having', 'more', "won't", 'now', 'did', "wouldn't", 'any', 'is', 'ours', 'those', 'then', 'each', "mustn't", 'wasn', 'been', 'who', 'but', 'such', 'same', 'from', 'other', "hadn't", 'o', 'against', 'were', 'as', 'between', 'd', 'out', 'had', 'the', 'than', 'll', 'myself', 'off', "couldn't", 'wouldn', 'below', 'mustn', "you'd", 'are', 'at', 'won', 's', 'can', 'of', 'why', "that'll", "you'll", 'some', "isn't", 're', 'again', "don't", 'shouldn', 'or', "she's", 'do', "didn't", 'where', 'before', 'i', "doesn't", 'him', 'itself', 'because', 'in', "you're", 'so', 'very', 'they', 'hadn', 'own', 'aren', 'she', 'once', 'will', 'under', 'am', 'my', 'and', "should've", 'for', 'ain', 'herself', 'after', 'doing', 'few', 'does'

NORMALIZED TRAIN REVIEWS

In [None]:
norm_train_reviews = data.review[:300000]
norm_train_reviews[0]

'appena uscito un nuovo video le cryptomonnai qui pulv risent bitcoin en 2019 http co ycsqmvrni'

NORMALIZED TEST REVIEWS

In [None]:
norm_test_reviews = data.review[300001:]
norm_test_reviews[300001]

'liquid btc perpetu sold 760 00 btc 7968 00 26 may 2019 06 34 51 utc trade id 22087974'

BAGS OF WORDS MODEL

In [None]:
# Count vectorizer for bag of words
cv = CountVectorizer(min_df=0, max_df=1, binary=False, ngram_range=(1,3))

# Transformed train reviews
cv_train_reviews = cv.fit_transform(norm_train_reviews)

# Transformed test reviews
cv_test_reviews = cv.transform(norm_test_reviews)

print('BOW_cv_train:', cv_train_reviews.shape)
print('BOW_cv_test:', cv_test_reviews.shape)

BOW_cv_train: (300000, 3212679)
BOW_cv_test: (300000, 3212679)


TF-IDF MODEL

In [None]:
# TF-IDF vectorizer
tv = TfidfVectorizer(min_df=0, max_df=1, use_idf=True, ngram_range=(1,3))

# Transformed train reviews
tv_train_reviews = tv.fit_transform(norm_train_reviews)

# Transformed test reviews
tv_test_reviews = tv.transform(norm_test_reviews)

print('Tfidf_train:', tv_train_reviews.shape)
print('Tfidf_test:', tv_test_reviews.shape)

Tfidf_train: (300000, 3212679)
Tfidf_test: (300000, 3212679)


LABELING THE SENTIMENT TEXT

In [None]:
# Labeling the sentient data
lb = LabelBinarizer()

# Transformed sentiment data
sentiment_data = lb.fit_transform(data['sentiment'])
print(sentiment_data.shape)

(600001, 1)


SPLIT THE SENTIMENT DATA

In [None]:
train_sentiments = sentiment_data[:300000]
test_sentiments = sentiment_data[300001:]
print(train_sentiments)
print(test_sentiments)

[[1]
 [1]
 [1]
 ...
 [1]
 [1]
 [1]]
[[1]
 [1]
 [1]
 ...
 [0]
 [0]
 [0]]


# Model

SVM MODEL

In [None]:
# Training the linear SVM
svm = SGDClassifier(loss='hinge', max_iter=500, random_state=42)

# Fitting the SVM for bag of words
svm_bow = svm.fit(cv_train_reviews, train_sentiments)
print(svm_bow)

# Fitting the SVM for TF-IDF features
svm_tfidf = svm.fit(tv_train_reviews, train_sentiments)
print(svm_tfidf)

SGDClassifier(max_iter=500, random_state=42)
SGDClassifier(max_iter=500, random_state=42)


MODEL PERFORMANCE

In [None]:
# Predicting the model for bag of words
svm_bow_predict = svm.predict(cv_test_reviews)
print(svm_bow_predict)

# Predicting the model for TF-IDF features
svm_tfidf_predict = svm.predict(tv_test_reviews)
print(svm_tfidf_predict)

[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]


MODEL ACCURACY

In [None]:
# Accuracy score for bag of words
svm_bow_score = accuracy_score(test_sentiments, svm_bow_predict)
print("svm_bow_score :", svm_bow_score)

# Accuracy score for TF-IDF features
svm_tfidf_score = accuracy_score(test_sentiments, svm_tfidf_predict)
print("svm_tfidf_score :", svm_tfidf_score)

svm_bow_score : 0.06302666666666666
svm_tfidf_score : 0.06302666666666666


# Evaluation

CLASSIFICATION REPORT

In [None]:
# Classification report for bag of words
svm_bow_report = classification_report(test_sentiments, svm_bow_predict, target_names = ['Positive','Negative'])
print(svm_bow_report)

# Classification report for TF-IDF features
svm_tfidf_report = classification_report(test_sentiments, svm_tfidf_predict, target_names = ['Positive','Negative'])
print(svm_tfidf_report)

              precision    recall  f1-score   support

    Positive       0.06      1.00      0.12     18908
    Negative       0.00      0.00      0.00    281092

    accuracy                           0.06    300000
   macro avg       0.03      0.50      0.06    300000
weighted avg       0.00      0.06      0.01    300000

              precision    recall  f1-score   support

    Positive       0.06      1.00      0.12     18908
    Negative       0.00      0.00      0.00    281092

    accuracy                           0.06    300000
   macro avg       0.03      0.50      0.06    300000
weighted avg       0.00      0.06      0.01    300000



CONFUSSION MATRIX

In [None]:
#confusion matrix for bag of words
cm_bow = confusion_matrix(test_sentiments, svm_bow_predict, labels = [1,0])
print(cm_bow)

#confusion matrix for TF-IDF features
cm_tfidf = confusion_matrix(test_sentiments, svm_tfidf_predict, labels = [1,0])
print(cm_tfidf)

[[     0 281092]
 [     0  18908]]
[[     0 281092]
 [     0  18908]]
