# Include Relevant Imports

In [None]:
# Required imports
import operator

import pandas as pd
import numpy as np
import math
import nltk

from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

from sklearn.metrics import classification_report
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics

%matplotlib inline

In [None]:
# Authenticate the Colab user and build a PyDrive client; the dataset is stored
# in Google Drive, so this must run before any file can be fetched.
auth.authenticate_user()
gauth = GoogleAuth()
# Reuse the application-default credentials for PyDrive.
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# Helper kept for reference: lists the title and id of every file in the given
# parent folder, which is how a file id like the one below can be looked up.
# file_list = drive.ListFile({'q': "'1D4X5HSBb2TnFYTy4JM7xgEo-4EYDSwwB' in parents"}).GetList()
# for file1 in file_list:
#     print ('title: %s, id: %s' % (file1['title'], file1['id']))

# Load dataset: fetch the file by its Drive id and save it to the working
# directory under the name read by the pre-processing cell.
downloaded = drive.CreateFile({"id":"1rLHp0rg0T5deyUSkCQ0Q27Skn2xQfDiO"})
downloaded.GetContentFile("twitter_data_cleaned.csv")


# Data Pre Processing

In [None]:
# Show long tweet texts in full when DataFrames are displayed.
pd.options.display.max_colwidth = 500

tweets = pd.read_csv('twitter_data_cleaned.csv')
# Remove the leftmost column (presumably the row index saved with the CSV).
tweets = tweets.drop(columns=['Unnamed: 0'])
# Rows with a missing label or text get a single space, so later string
# operations never see NaN and unlabelled rows can be selected with == ' '.
# fillna states this directly; replace(np.nan, ..., regex=True) applied a regex
# flag that has no meaning for NaN.
tweets['Target'] = tweets['Target'].fillna(' ')
tweets['tweet_text'] = tweets['tweet_text'].fillna(' ')
tweets.head()

Unnamed: 0,Tweeted_at,tweet_id,tweet_text,tweet_retweets,tweet_likes,tweet_username,Target
0,2021-04-02 07:30:27,1377886129475751937,Analysis: Power play: India wields oil 'weapon' to cut dependence on Saudi,2,1,ReutersBiz,ON
1,2021-04-02 07:15:27,1377882354950275072,Indonesia expands tax breaks for sales of bigger cars,0,4,ReutersBiz,ON
2,2021-04-02 07:00:00,1377878465467330561,The fate of food delivery start-up Deliveroo's IPO on the London stock market may be a sign that investors are beco…,1,6,ReutersBiz,ON
3,2021-04-02 06:45:11,1377874738312314882,Dollar heads for third weekly gain as payrolls data looms,5,15,ReutersBiz,ON
4,2021-04-02 05:45:10,1377859630785777665,Dollar steadies before U.S. payrolls as sentiment improves,5,13,ReutersBiz,OP


In [None]:
# Sanity check: frame dimensions and the distinct values found in "Target"
print("Shape: ", tweets.shape)
print("Target labels enums: ", tweets['Target'].unique())

Shape:  (14060, 7)
Target labels enums:  ['ON' 'OP' 'NT' ' ']


In [None]:
# Subjectivity label: the first character of "Target" becomes the new
# "Opinion" column.
tweets['Opinion'] = tweets['Target'].astype(str).str.get(0)

tweets.head()

Unnamed: 0,Tweeted_at,tweet_id,tweet_text,tweet_retweets,tweet_likes,tweet_username,Target,Opinion
0,2021-04-02 07:30:27,1377886129475751937,Analysis: Power play: India wields oil 'weapon' to cut dependence on Saudi,2,1,ReutersBiz,ON,O
1,2021-04-02 07:15:27,1377882354950275072,Indonesia expands tax breaks for sales of bigger cars,0,4,ReutersBiz,ON,O
2,2021-04-02 07:00:00,1377878465467330561,The fate of food delivery start-up Deliveroo's IPO on the London stock market may be a sign that investors are beco…,1,6,ReutersBiz,ON,O
3,2021-04-02 06:45:11,1377874738312314882,Dollar heads for third weekly gain as payrolls data looms,5,15,ReutersBiz,ON,O
4,2021-04-02 05:45:10,1377859630785777665,Dollar steadies before U.S. payrolls as sentiment improves,5,13,ReutersBiz,OP,O


In [None]:
from collections import Counter

# Per-tweet word count (whitespace-delimited tokens) and the corpus total.
tweets['totalwords'] = tweets['tweet_text'].str.split().str.len()
total_word_count = tweets['totalwords'].sum()
print(total_word_count)

# Number of distinct tokens in the corpus. The tokens come from the same
# whitespace split used for the total above: splitting on a single " " produced
# an empty-string token for every run of spaces (including blank tweets) and
# did not break on tabs or newlines.
result = Counter(" ".join(tweets['tweet_text'].values.tolist()).split()).items()
print(len(result))

214932
30688


In [None]:
# Dataframes which only contain labelled/unlabelled tweets.
# A blank " " in "Target" marks an unlabelled row. Explicit copies are taken
# because later cells add or overwrite columns on these frames; assigning into
# a boolean-mask slice of `tweets` triggers pandas' SettingWithCopyWarning.
labelled_tweets = tweets[tweets['Target'] != ' '].copy()
unlabelled_tweets = tweets[tweets['Target'] == ' '].copy()

# Convert to Series: raw text as the feature, subjectivity letter as the target
X = labelled_tweets['tweet_text']
y = labelled_tweets['Opinion']

# Data cleaning

Steps done:

1. Remove twitter URLs (already done prior to importing csv file)

2. Remove stopwords 

3. Remove punctuation

4. Stemming 

In [None]:
from nltk.corpus import stopwords
# Fetch the stopword corpus that cleaning() reads; the captured output shows it
# reports "already up-to-date" when the data is present.
nltk.download('stopwords')
import string
from nltk.stem import PorterStemmer


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
import re

def cleaning(X):
  """Normalise raw tweet texts into space-joined, stemmed tokens.

  For each text, every character that is not an ASCII letter is replaced by a
  space, the text is lower-cased and split on whitespace, English stopwords are
  dropped, and each remaining word is reduced to its Porter stem.

  Args:
    X: Iterable of strings, e.g. a pandas Series of tweet texts. Elements are
      visited in iteration order, so a plain list works as well.

  Returns:
    A list with one cleaned string per input text, in input order.
  """
  # A set gives constant-time membership tests; the NLTK stopword list would
  # otherwise be scanned once for every word of every tweet.
  stop_words = set(stopwords.words('english'))
  stemmer = PorterStemmer()
  # Compiled once rather than re-resolved for each tweet.
  non_letters = re.compile('[^a-zA-Z]')

  cleaned_data = []
  for text in X:
    words = non_letters.sub(' ', text).lower().split()
    # No separate punctuation filter is needed: after the substitution above
    # every token consists solely of ASCII letters.
    kept = [stemmer.stem(word) for word in words if word not in stop_words]
    cleaned_data.append(' '.join(kept))
  return cleaned_data
   

In [None]:
cleaned_data = cleaning(X)

# Preview the first five stemmed tweets
for stemmed_tweet in cleaned_data[:5]:
  print(stemmed_tweet)

analysi power play india wield oil weapon cut depend saudi
indonesia expand tax break sale bigger car
fate food deliveri start deliveroo ipo london stock market may sign investor beco
dollar head third weekli gain payrol data loom
dollar steadi u payrol sentiment improv


In [None]:
# Display the subjectivity target Series. Each entry is the single-letter
# "Opinion" value of a labelled tweet; the captured output shows 'O' and 'N'.
y

0        O
1        O
2        O
3        O
4        O
        ..
12795    O
12796    N
12797    N
12798    N
12799    O
Name: Opinion, Length: 2164, dtype: object

In [None]:
sentiment_ordering = ['O', 'N']

# Encode each label as its position in sentiment_ordering: 'O' -> 0, 'N' -> 1.
# The codes are derived from the source column rather than from `y` itself so
# the cell can be re-run safely; looking up an already-encoded integer with
# list.index() raises ValueError.
y = labelled_tweets['Opinion'].apply(sentiment_ordering.index)
y.head()

0    0
1    0
2    0
3    0
4    0
Name: Opinion, dtype: int64

# Convert tweets into a "Bag of Words"

Create a matrix table, where each row represents a tweet and each word will have separate columns for itself that represents its frequency.

One con about this method is that the order of the sentence is lost.

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts with the vocabulary capped at 3000 terms
cv = CountVectorizer(max_features=3000)
bow_counts = cv.fit_transform(cleaned_data)
X_fin = bow_counts.toarray()
X_fin.shape

(2164, 3000)

# Preliminary Results for **Subjectivity Detection**

Weighted average f1 score takes into account label imbalance and assigns a weight for each bin based on sample count i.e. majority bin will have the greatest weight. 

1. Multi-nomial Naive Bayes -> 0.72

2. Multi-layer Perceptron (MLP) -> 0.71

3. Support Vector Machine (SVM) -> 0.70

4. Random forest classifier -> 0.69

5. Gradient Boosting Classifier -> 0.72

While hyperparameter tuning was implemented, we found that it did not meaningfully improve the overall accuracy of the models.

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the labelled tweets for evaluation; fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X_fin,
    y,
    test_size=0.3,
    random_state=42,
)

# Multinomial Naive Bayes model 

In theory, works well for text based data

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes model: fit on the training split, then predict the
# held-out split
model = MultinomialNB().fit(X_train, y_train)
y_pred = model.predict(X_test)

In [None]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 of the predictions from the previous cell
cf = classification_report(y_true=y_test, y_pred=y_pred)
print(cf)

              precision    recall  f1-score   support

           0       0.65      0.87      0.74       313
           1       0.82      0.55      0.66       337

    accuracy                           0.71       650
   macro avg       0.73      0.71      0.70       650
weighted avg       0.74      0.71      0.70       650



# Random forest classifier


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random Forest Classifier with default hyper-parameters and a fixed seed
clf = RandomForestClassifier(random_state=42).fit(X_train, y_train)
y_pred = clf.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.64      0.88      0.74       313
           1       0.82      0.54      0.65       337

    accuracy                           0.70       650
   macro avg       0.73      0.71      0.69       650
weighted avg       0.73      0.70      0.69       650



In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

# Random Forest Classifier: randomised hyper-parameter search.
# n_estimators is sampled from the grid below, so the base estimator takes no
# value of its own (the original passed a stray positional 3). The seed makes
# each candidate forest reproducible.
rf = RandomForestClassifier(random_state=42)

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]

# Number of features to consider at every split.
# 'auto' was an alias of 'sqrt' for classifiers and is rejected by newer
# scikit-learn releases, so the two distinct string options are listed instead.
max_features = ['sqrt', 'log2']

# Maximum number of levels in tree (None = grow until leaves are pure)
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)

# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]

# Whether each tree is trained on a bootstrap sample. This list was referenced
# by the grid but never defined, so the cell failed with NameError.
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

# 100 sampled configurations, each scored with 3-fold cross-validation
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=43, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train, y_train)
rf_random.best_params_


# Gradient Boosting Classifier

In [None]:
from sklearn.datasets import make_hastie_10_2
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV

# Gradient Boosting Classifier with default hyper-parameters and a fixed seed
clf = GradientBoostingClassifier(random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.61      0.92      0.73       313
           1       0.85      0.45      0.59       337

    accuracy                           0.68       650
   macro avg       0.73      0.69      0.66       650
weighted avg       0.74      0.68      0.66       650



In [None]:
"""
Gradient Boosting Classifier Hyperparameter Optimization 

"""

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import RandomizedSearchCV

gbc =  GradientBoostingClassifier(random_state = 42)

# Candidate values for each hyper-parameter.
# NOTE(review): 'deviance', 'mse' and 'mae' are names that newer scikit-learn
# releases appear to have renamed or dropped; confirm them against the
# installed version before re-running.
n_estimators = [500, 600, 700, 800, 900]
max_features = [2,4,6,8,10]
max_depth = [2,4,6,8,10]
min_samples_split = [5, 6,7,8,9,10]
min_samples_leaf = [1, 2, 4]
loss = ['deviance', 'exponential']
criterion = ['friedman_mse', 'mse', 'mae']

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'criterion': criterion,
               'loss': loss
               }

# Seeded so the same 100 configurations are sampled on every run; without a
# seed the selected parameters and the report below change between runs.
gbc_random = RandomizedSearchCV(estimator = gbc, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)

# Fit the random search model
gbc_random.fit(X_train, y_train)
# A bare expression in the middle of a cell is not displayed, so print it.
print(gbc_random.best_params_)

#classfication report
y_pred = gbc_random.predict(X_test)
report = classification_report(y_test,y_pred)
print(report)


Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 158 tasks      | elapsed:  7.2min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed: 11.8min finished


              precision    recall  f1-score   support

           0       0.70      0.85      0.77       328
           1       0.80      0.63      0.70       322

    accuracy                           0.74       650
   macro avg       0.75      0.74      0.73       650
weighted avg       0.75      0.74      0.74       650



In [None]:
# Hyper-parameter combination selected by the randomised search above
gbc_random.best_params_

{'criterion': 'mse',
 'loss': 'exponential',
 'max_depth': 4,
 'max_features': 4,
 'min_samples_leaf': 1,
 'min_samples_split': 5,
 'n_estimators': 700}

# Multi-layer Perceptron (MLP) Classifier

A form of neural network

In [None]:
from sklearn.neural_network import MLPClassifier

# Multi-layer Perceptron (MLP) Classifier: default architecture, fixed seed
mlp_clf = MLPClassifier(random_state=42).fit(X_train, y_train)
y_pred = mlp_clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.68      0.78      0.73       313
           1       0.76      0.66      0.70       337

    accuracy                           0.72       650
   macro avg       0.72      0.72      0.72       650
weighted avg       0.72      0.72      0.71       650



In [None]:
# MLP with hidden_layer_sizes=800; seeded so the report below is reproducible.
mlp =  MLPClassifier(hidden_layer_sizes=800, random_state=42)

hidden_layer_sizes = [100,200,300]

# Search space for the (currently disabled) randomised search below. It
# previously pointed at `n_estimators`, a list left over from the
# gradient-boosting cell, instead of the candidates defined just above.
params = {'hidden_layer_sizes': hidden_layer_sizes
          }

#mlp_random = RandomizedSearchCV(estimator = mlp, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)

# Fit the model directly, since no search is run
mlp.fit(X_train, y_train)
#mlp.best_params_

#classfication report
y_pred = mlp.predict(X_test)
report = classification_report(y_test,y_pred)
print(report)


              precision    recall  f1-score   support

           0       0.70      0.76      0.73       334
           1       0.73      0.66      0.69       316

    accuracy                           0.71       650
   macro avg       0.72      0.71      0.71       650
weighted avg       0.71      0.71      0.71       650



# Support vector machine (SVM) classifier

In [None]:
from sklearn.linear_model import SGDClassifier

# Support vector machine (SVM) classifier, trained with stochastic gradient
# descent and a fixed seed
svm_sgd_clf = SGDClassifier(random_state=42).fit(X_train, y_train)

y_pred = svm_sgd_clf.predict(X_test)
print(classification_report(y_test, y_pred))
metrics.confusion_matrix(y_test, y_pred)

              precision    recall  f1-score   support

           0       0.68      0.73      0.71       313
           1       0.73      0.68      0.71       337

    accuracy                           0.71       650
   macro avg       0.71      0.71      0.71       650
weighted avg       0.71      0.71      0.71       650



array([[229,  84],
       [107, 230]])

In [None]:
"""
Support vector machine (SVM) classifier
Classification report

"""
svm_sgd_clf = SGDClassifier(random_state=42)
# Exhaustive grid over regularisation strength, learning-rate exponent,
# epsilon and loss function. The commented-out entries are further dimensions
# that were left out of the search.
parameters = {'alpha': (0.0001, 0.001),
              'power_t':(0.5, 1.00),
              'epsilon': (0.01, 1),
              'loss': [ 'hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron']
              #'penalty': ['l2', 'l1', 'elasticnet'],
              #'fit_intercept': [True, False],
              #'tol': (0.00001, 0.01),
              #'learning_rate': ['constant','optimal', 'invscaling', 'adaptive'],
              #'eta0':(0.0001, 1)
              }
# From here on svm_sgd_clf refers to the fitted search object, not the bare
# estimator.
svm_sgd_clf = GridSearchCV(svm_sgd_clf, parameters, n_jobs=-1)
svm_sgd_clf = svm_sgd_clf.fit(X_train, y_train)


y_pred = svm_sgd_clf.predict(X_test)
print(classification_report(y_test, y_pred))
# The confusion matrix was computed but silently discarded, because only the
# last expression of a cell is displayed. Print it.
print(metrics.confusion_matrix(y_test, y_pred))
print(svm_sgd_clf.best_params_)


              precision    recall  f1-score   support

           0       0.73      0.75      0.74       328
           1       0.74      0.71      0.73       322

    accuracy                           0.73       650
   macro avg       0.73      0.73      0.73       650
weighted avg       0.73      0.73      0.73       650

{'alpha': 0.001, 'epsilon': 0.01, 'loss': 'log', 'power_t': 0.5}


In [None]:
from sklearn import linear_model

# Logistic regression with default settings
LogReg = linear_model.LogisticRegression().fit(X_train, y_train)
y_predict = LogReg.predict(X_test)
print(classification_report(y_test, y_predict))

              precision    recall  f1-score   support

           0       0.67      0.82      0.73       313
           1       0.79      0.62      0.69       337

    accuracy                           0.72       650
   macro avg       0.73      0.72      0.71       650
weighted avg       0.73      0.72      0.71       650



In [None]:
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(random_state = 42)
solvers = ['newton-cg', 'lbfgs', 'liblinear']
penalty = ['l2']
c_values = [100, 10, 1.0, 0.1, 0.01]

# define grid search
grid = dict(solver=solvers,penalty=penalty,C=c_values)
# Named cv_folds rather than `cv` so the fitted CountVectorizer bound to `cv`
# is not overwritten by a cross-validation splitter.
cv_folds = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv_folds, scoring='accuracy',error_score=0)
grid_result = grid_search.fit(X_train, y_train)

# summarize results: best configuration first, then mean (std) accuracy of
# every configuration tried
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.749444 using {'C': 1.0, 'penalty': 'l2', 'solver': 'liblinear'}
0.737772 (0.032665) with: {'C': 100, 'penalty': 'l2', 'solver': 'newton-cg'}
0.737552 (0.032426) with: {'C': 100, 'penalty': 'l2', 'solver': 'lbfgs'}
0.737992 (0.032579) with: {'C': 100, 'penalty': 'l2', 'solver': 'liblinear'}
0.744372 (0.035004) with: {'C': 10, 'penalty': 'l2', 'solver': 'newton-cg'}
0.744372 (0.035004) with: {'C': 10, 'penalty': 'l2', 'solver': 'lbfgs'}
0.744372 (0.034499) with: {'C': 10, 'penalty': 'l2', 'solver': 'liblinear'}
0.749227 (0.031325) with: {'C': 1.0, 'penalty': 'l2', 'solver': 'newton-cg'}
0.749227 (0.031325) with: {'C': 1.0, 'penalty': 'l2', 'solver': 'lbfgs'}
0.749444 (0.031045) with: {'C': 1.0, 'penalty': 'l2', 'solver': 'liblinear'}
0.731618 (0.032293) with: {'C': 0.1, 'penalty': 'l2', 'solver': 'newton-cg'}
0.731618 (0.032293) with: {'C': 0.1, 'penalty': 'l2', 'solver': 'lbfgs'}
0.733823 (0.033321) with: {'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'}
0.687586 (0.028093) wit

In [None]:
# Evaluate the grid-search result on the held-out split
y_predict = grid_result.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_predict))

              precision    recall  f1-score   support

           0       0.67      0.82      0.73       313
           1       0.79      0.62      0.69       337

    accuracy                           0.72       650
   macro avg       0.73      0.72      0.71       650
weighted avg       0.73      0.72      0.71       650



# Polarity Detection Section



In [None]:
# Polarity label: the second character of "Target" becomes the new "Polarity"
# column.
tweets['Polarity'] = tweets['Target'].astype(str).str.get(1)
tweets.head()

Unnamed: 0,Tweeted_at,tweet_id,tweet_text,tweet_retweets,tweet_likes,tweet_username,Target,Opinion,totalwords,Polarity
0,2021-04-02 07:30:27,1377886129475751937,Analysis: Power play: India wields oil 'weapon' to cut dependence on Saudi,2,1,ReutersBiz,ON,O,12.0,N
1,2021-04-02 07:15:27,1377882354950275072,Indonesia expands tax breaks for sales of bigger cars,0,4,ReutersBiz,ON,O,9.0,N
2,2021-04-02 07:00:00,1377878465467330561,The fate of food delivery start-up Deliveroo's IPO on the London stock market may be a sign that investors are beco…,1,6,ReutersBiz,ON,O,21.0,N
3,2021-04-02 06:45:11,1377874738312314882,Dollar heads for third weekly gain as payrolls data looms,5,15,ReutersBiz,ON,O,10.0,N
4,2021-04-02 05:45:10,1377859630785777665,Dollar steadies before U.S. payrolls as sentiment improves,5,13,ReutersBiz,OP,O,8.0,P


# Making sure dataset is not too biased towards one class

Number of labelled tweets: 2164 / 14060 (~15% labelled)

Number of opinionated tweets: 1117 (~52 % opinionated, ~48% neutral)

Positive to Negative ratio: 603:514 (~54:46)

TL;DR: Seems good

In [None]:
# Row masks: a polarity letter is present / the letter is not 'T'
has_polarity = tweets['Polarity'].notna()
not_t = tweets['Polarity'] != 'T'

# Dataframes which only contain labelled/unlabelled tweets
labelled_tweets_polarity = tweets[has_polarity & not_t]  # Only labelled N/P
unlabelled_tweets_polarity = tweets[~has_polarity]  # No label i.e. NaN

# Convert to Series
X_polarity = labelled_tweets_polarity['tweet_text']
y_polarity = labelled_tweets_polarity['Polarity']

# Sanity check
n_labelled = len(labelled_tweets)
n_opinionated = len(labelled_tweets_polarity)
print("Number of labelled tweets: ", n_labelled)
print("Number of opinionated tweets: {} ({:0.2f} %)".format(n_opinionated, n_opinionated/n_labelled*100))
print("Polarity labels enums: ", labelled_tweets_polarity['Polarity'].unique())

print("Positive labels: ", len(labelled_tweets_polarity[labelled_tweets_polarity['Polarity'] == 'P']))
print("Negative labels: ", len(labelled_tweets_polarity[labelled_tweets_polarity['Polarity'] == 'N']))


Number of labelled tweets:  2164
Number of opinionated tweets: 1117 (51.62 %)
Polarity labels enums:  ['N' 'P']
Positive labels:  603
Negative labels:  514


In [None]:
cleaned_data_polarity = cleaning(X_polarity)

# Preview the first five stemmed tweets
for stemmed_tweet in cleaned_data_polarity[:5]:
  print(stemmed_tweet)

analysi power play india wield oil weapon cut depend saudi
indonesia expand tax break sale bigger car
fate food deliveri start deliveroo ipo london stock market may sign investor beco
dollar head third weekli gain payrol data loom
dollar steadi u payrol sentiment improv


In [None]:
# Display the polarity target Series. Each entry is the single-letter
# "Polarity" value of a tweet kept for polarity detection; the captured output
# shows 'N' and 'P'.
y_polarity

0        N
1        N
2        N
3        N
4        P
        ..
12787    P
12793    P
12794    N
12795    N
12799    P
Name: Polarity, Length: 1117, dtype: object

In [None]:
polarity_ordering = ['N', 'P']

# Encode each label as its position in polarity_ordering: 'N' -> 0, 'P' -> 1.
# The codes are derived from the source column rather than from `y_polarity`
# itself so the cell can be re-run safely; looking up an already-encoded
# integer with list.index() raises ValueError.
y_polarity = labelled_tweets_polarity['Polarity'].apply(polarity_ordering.index)
y_polarity.head()

0    0
1    0
2    0
3    0
4    1
Name: Polarity, dtype: int64

# Convert tweets into a "Bag of Words"

Create a matrix table, where each row represents a tweet and each word will have separate columns for itself that represents its frequency.

One con about this method is that the order of the sentence is lost.

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts for the polarity tweets, vocabulary capped at 3000 terms
cv = CountVectorizer(max_features=3000)
bow_counts_polarity = cv.fit_transform(cleaned_data_polarity)
X_fin_polarity = bow_counts_polarity.toarray()
X_fin_polarity.shape

(1117, 3000)

# Preliminary Results for **Polarity Detection**

Weighted average f1 score takes into account label imbalance and assigns a weight for each bin based on sample count i.e. majority bin will have the greatest weight. 

Before any hyperparameter tuning is done, in order of weighted f1 score:

1. Multi-layer Perceptron (MLP) -> 0.78

2. Support Vector Machine (SVM) -> 0.76

3. Multi-nomial Naive Bayes -> 0.75

4. Random forest classifier -> 0.74

5. Gradient Boosting Classifier -> 0.73

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the polarity-labelled tweets for evaluation; fixed seed
X_train_p, X_test_p, y_train_p, y_test_p = train_test_split(
    X_fin_polarity,
    y_polarity,
    test_size=0.3,
    random_state=42,
)

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes model: fit on the polarity training split, then
# predict the held-out split
model = MultinomialNB().fit(X_train_p, y_train_p)
y_pred_p = model.predict(X_test_p)

In [None]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 of the polarity predictions
cf = classification_report(y_true=y_test_p, y_pred=y_pred_p)
print(cf)

              precision    recall  f1-score   support

           0       0.76      0.76      0.76       161
           1       0.78      0.78      0.78       175

    accuracy                           0.77       336
   macro avg       0.77      0.77      0.77       336
weighted avg       0.77      0.77      0.77       336



# Random forest classifier


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random Forest Classifier for polarity.
# Seeded like its subjectivity counterpart so the report below is
# reproducible; an unseeded forest gives different scores on each run.
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train_p, y_train_p)
y_pred_p = clf.predict(X_test_p)
report=classification_report(y_test_p, y_pred_p)
print(report)

              precision    recall  f1-score   support

           0       0.89      0.59      0.71       161
           1       0.71      0.93      0.81       175

    accuracy                           0.77       336
   macro avg       0.80      0.76      0.76       336
weighted avg       0.80      0.77      0.76       336



# Gradient Boosting Classifier

In [None]:
from sklearn.datasets import make_hastie_10_2
from sklearn.ensemble import GradientBoostingClassifier

# Gradient Boosting Classifier for polarity.
# Seeded like its subjectivity counterpart so the report below is
# reproducible.
clf = GradientBoostingClassifier(random_state=42).fit(X_train_p, y_train_p)
y_pred_p=clf.predict(X_test_p)
report=classification_report(y_test_p,y_pred_p)
print(report)

              precision    recall  f1-score   support

           0       0.80      0.56      0.66       161
           1       0.68      0.87      0.76       175

    accuracy                           0.72       336
   macro avg       0.74      0.71      0.71       336
weighted avg       0.74      0.72      0.71       336



# Multi-layer Perceptron (MLP) Classifier

A form of neural network

In [None]:
from sklearn.neural_network import MLPClassifier

# Multi-layer Perceptron (MLP) Classifier for polarity.
# Seeded like its subjectivity counterpart so the report below is
# reproducible; an unseeded network gives different scores on each run.
mlp_clf = MLPClassifier(random_state=42)
mlp_clf.fit(X_train_p, y_train_p)
y_pred_p = mlp_clf.predict(X_test_p)

print(classification_report(y_test_p, y_pred_p))

              precision    recall  f1-score   support

           0       0.78      0.77      0.77       161
           1       0.79      0.79      0.79       175

    accuracy                           0.78       336
   macro avg       0.78      0.78      0.78       336
weighted avg       0.78      0.78      0.78       336



# Support vector machine (SVM) classifier

In [None]:
from sklearn.linear_model import SGDClassifier

# Support vector machine (SVM) classifier for polarity.
# Seeded like its subjectivity counterpart; the model is fitted with
# shuffle=True (see the repr below), so an unseeded fit varies between runs.
svm_sgd_clf = SGDClassifier(random_state=42)
svm_sgd_clf.fit(X_train_p, y_train_p)

SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='hinge',
              max_iter=1000, n_iter_no_change=5, n_jobs=None, penalty='l2',
              power_t=0.5, random_state=None, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)

In [None]:
y_pred_p = svm_sgd_clf.predict(X_test_p)

# Support vector machine (SVM) classifier: classification report
print(classification_report(y_true=y_test_p, y_pred=y_pred_p))

              precision    recall  f1-score   support

           0       0.76      0.81      0.78       161
           1       0.81      0.76      0.78       175

    accuracy                           0.78       336
   macro avg       0.78      0.78      0.78       336
weighted avg       0.78      0.78      0.78       336



In [None]:
from sklearn import linear_model

# Logistic regression with default settings, for polarity
LogReg = linear_model.LogisticRegression().fit(X_train_p, y_train_p)
y_predict_p = LogReg.predict(X_test_p)
print(classification_report(y_test_p, y_predict_p))

              precision    recall  f1-score   support

           0       0.76      0.76      0.76       161
           1       0.78      0.78      0.78       175

    accuracy                           0.77       336
   macro avg       0.77      0.77      0.77       336
weighted avg       0.77      0.77      0.77       336



## Performance Metrics (MLP Classifier)

In [None]:
# Rebind to an independent copy before writing a column: assigning into a
# frame obtained by slicing `tweets` is what raised SettingWithCopyWarning.
unlabelled_tweets = unlabelled_tweets.copy()
# Any missing text becomes a single space so cleaning() always receives a str.
unlabelled_tweets['tweet_text'] = unlabelled_tweets['tweet_text'].fillna(' ')

X_unlabelled = unlabelled_tweets['tweet_text']
unlabelled_cleaned = cleaning(X_unlabelled)

# Show the first 5 stemmed tweets
for tweet in unlabelled_cleaned[0:5]:
  print(tweet)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


south korea feb factori activ grow fastest pace eight month
volkswagen soon known voltswagen u effect may intend flag vw big move
china brazil world greenest central bank activist say
u stock end modestli lower investor pull heavyweight tech stock spook new high trea
nation labor relat board begun review ballot amazon worker alabama vote w


In [None]:
# The subjectivity models were trained on columns produced by a vectorizer
# fitted on `cleaned_data`. Fitting a new vocabulary on the unlabelled text
# yields 3000 different columns, so column j would no longer mean the same
# word as at training time and the predictions would be meaningless.
# Refitting on the identical labelled input rebuilds the training vocabulary;
# the unlabelled tweets are then only transformed.
cv = CountVectorizer(max_features=3000)
cv.fit(cleaned_data)
X_fin_unlabelled = cv.transform(unlabelled_cleaned).toarray()
X_fin_unlabelled.shape

(11896, 3000)

In [None]:
import time

# Multi-layer Perceptron Subjective Classification: train on the labelled
# subjectivity split, then time prediction over the unlabelled feature matrix
# (training time is excluded from the measurement)
mlp_clf = MLPClassifier(random_state=42).fit(X_train, y_train)

start = time.time()
y_mlp_s = mlp_clf.predict(X_fin_unlabelled)
end = time.time()

print(y_mlp_s)
print('Classfication Time:', end-start)

[1 1 0 ... 1 0 0]
Classfication Time: 0.3634805679321289


In [None]:
# assign() returns a new frame with "Opinion" replaced by the predicted class
# ids, so nothing is written into a slice of `tweets` (the original assignment
# raised SettingWithCopyWarning).
unlabelled_tweets = unlabelled_tweets.assign(Opinion=y_mlp_s)
# Tweets predicted as class 1; a single .loc lookup replaces chained indexing.
unlabelled_tweets.loc[unlabelled_tweets['Opinion'] == 1, ['tweet_text', 'Opinion']].head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,tweet_text,Opinion
300,South Korea's Feb factory activity grows at fastest pace in eight months,1
301,"Volkswagen will soon be known as 'Voltswagen' in the U.S., effective from May and is intended to flag VW's big move…",1
304,The National Labor Relations Board has begun reviewing ballots from Amazon’s workers in Alabama who have voted on w…,1
305,Pandemic pushes gender equality back a generation - WEF,1
306,Analysis: Archegos meltdown set to intensify shadow banking regulatory scrutiny,1


In [None]:
# Tweets predicted as class 0
unlabelled_tweets.loc[unlabelled_tweets['Opinion'] == 0][['tweet_text','Opinion']].head()

Unnamed: 0,tweet_text,Opinion
302,"China and Brazil have world's greenest central banks, activists say",0
303,"U.S. stocks ended modestly lower as investors pulled out of heavyweight tech stocks, spooked by a new high in Treas…",0
308,China's March factory activity expands at faster pace: official PMI,0
310,Growth in China's services sector surges in March,0
312,Markets in first-quarter: Riding a tiger and waking some bears,0


In [None]:
# Imported under an alias: `drive` already names the PyDrive client created in
# the authentication cell, and a plain import would silently replace it.
from google.colab import drive as colab_drive
colab_drive.mount('/drive', force_remount=True)

# Persist the unlabelled tweets together with their predicted "Opinion" class
unlabelled_tweets.to_csv('/drive/My Drive/serene/unlabelled_opinion.csv')

Mounted at /drive


In [None]:
# Keep only the tweets predicted as class 0 and clean their text
opinion_zero_mask = unlabelled_tweets['Opinion'] == 0
X_pol = unlabelled_tweets[opinion_zero_mask]['tweet_text']
unlabelled_cleaned = cleaning(X_pol)

# Preview the first five stemmed tweets
for stemmed_tweet in unlabelled_cleaned[:5]:
  print(stemmed_tweet)

china brazil world greenest central bank activist say
u stock end modestli lower investor pull heavyweight tech stock spook new high trea
china march factori activ expand faster pace offici pmi
growth china servic sector surg march
market first quarter ride tiger wake bear


In [None]:
# Bag-of-words counts for the cleaned tweets, capped at 3000 vocabulary terms.
# NOTE(review): fit_transform() learns a new vocabulary from the unlabelled
# text. If X_train_p was built with a different fitted CountVectorizer, column i
# here need not be the same token as column i in the training matrix, and the
# classifier's predictions on X_fin_unlabelled would not be meaningful —
# confirm, and reuse the training vectorizer's transform() if so.
cv=CountVectorizer(max_features=3000)
X_fin_unlabelled = cv.fit_transform(unlabelled_cleaned).toarray()
# Displayed as the cell output: (n_tweets, n_features).
X_fin_unlabelled.shape

(7251, 3000)

In [None]:
"""
Polarity

"""

# Train the polarity model on the labelled polarity split. Note that this
# rebinds mlp_clf, replacing whichever model an earlier cell stored in it.
mlp_clf = MLPClassifier(random_state=42)
mlp_clf.fit(X_train_p, y_train_p)

# Time only the prediction step on the vectorised unlabelled tweets.
# NOTE(review): assumes X_fin_unlabelled uses the same feature columns as
# X_train_p — verify against the vectoriser used for training.
start = time.time()
y_mlp_p = mlp_clf.predict(X_fin_unlabelled)
end = time.time()

# Number of predictions made, then elapsed wall-clock seconds.
print(len(y_mlp_p))
print('Classfication Time:', end-start)

7251
Classfication Time: 0.23314857482910156


In [None]:
# Keep the Opinion == 0 rows (the same subset the polarity features were built
# from) and overwrite Target with the predicted polarity label. assign() returns
# a new frame, so nothing is written into a slice of unlabelled_tweets and no
# SettingWithCopyWarning is raised. y_mlp_p must hold one prediction per
# selected row, in row order.
data = unlabelled_tweets.loc[unlabelled_tweets['Opinion'] == 0].assign(Target=y_mlp_p)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [None]:
# Show the first rows to check that Target now holds the predicted polarity labels.
data.head()

Unnamed: 0,Tweeted_at,tweet_id,tweet_text,tweet_retweets,tweet_likes,tweet_username,Target,Opinion,totalwords
302,2021-03-31 04:00:13,1377108444872765445,"China and Brazil have world's greenest central banks, activists say",5,5,ReutersBiz,1,0,10
303,2021-03-31 04:00:00,1377108390233657344,"U.S. stocks ended modestly lower as investors pulled out of heavyweight tech stocks, spooked by a new high in Treas…",0,1,ReutersBiz,0,0,20
308,2021-03-31 03:00:10,1377093333017841664,China's March factory activity expands at faster pace: official PMI,3,10,ReutersBiz,0,0,10
310,2021-03-31 02:45:10,1377089556621709312,Growth in China's services sector surges in March,6,10,ReutersBiz,1,0,8
312,2021-03-31 02:15:09,1377082003514855425,Markets in first-quarter: Riding a tiger and waking some bears,4,13,ReutersBiz,1,0,10


In [None]:
# Write the automatically labelled subset to the mounted Google Drive folder.
# Relies on the earlier drive.mount('/drive', ...) cell having been run.
data.to_csv('/drive/My Drive/serene/unlabelled_opinion_auto.csv')

# Performance Metric (SVM classifier)

In [None]:
# Replace missing tweet text with a single space. assign() builds a new frame
# instead of writing into a slice of another DataFrame, which is what raised
# SettingWithCopyWarning when the column was assigned in place.
unlabelled_tweets = unlabelled_tweets.assign(
    tweet_text=unlabelled_tweets['tweet_text'].replace(np.nan, ' ', regex=True)
)

X_unlabelled = unlabelled_tweets['tweet_text']
unlabelled_cleaned = cleaning(X_unlabelled)

# Show the first 5 stemmed tweets
for tweet in unlabelled_cleaned[0:5]:
  print(tweet)

# Bag-of-words counts capped at 3000 vocabulary terms.
# NOTE(review): fit_transform() learns a new vocabulary from the unlabelled
# text. If X_train came from a different fitted CountVectorizer, the columns
# here will not line up with the features the classifier was trained on —
# confirm, and reuse the training vectorizer's transform() if so.
cv=CountVectorizer(max_features=3000)
X_fin_unlabelled = cv.fit_transform(unlabelled_cleaned).toarray()
X_fin_unlabelled.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


south korea feb factori activ grow fastest pace eight month
volkswagen soon known voltswagen u effect may intend flag vw big move
china brazil world greenest central bank activist say
u stock end modestli lower investor pull heavyweight tech stock spook new high trea
nation labor relat board begun review ballot amazon worker alabama vote w


(11896, 3000)

In [None]:
import time
# Subjectivity classification with SGDClassifier (the "SVM" classifier of this
# section; scikit-learn's default hinge loss makes it a linear SVM). The
# mlp_clf / y_mlp_s names are reused from the earlier multi-layer perceptron
# cells and do not mean an MLP is used here.

"""
Subject

"""
mlp_clf = SGDClassifier(random_state=42)
mlp_clf.fit(X_train, y_train)
# Time only the prediction step on the vectorised unlabelled tweets.
# NOTE(review): assumes X_fin_unlabelled shares X_train's feature columns — verify.
start = time.time()
y_mlp_s = mlp_clf.predict(X_fin_unlabelled)
end = time.time()
# Predicted labels, then elapsed wall-clock seconds.
print(y_mlp_s)
print('Classfication Time:', end-start)

[1 1 0 ... 0 0 0]
Classfication Time: 0.08064651489257812


In [None]:
# Attach the predicted Opinion label. assign() returns a new frame, so nothing
# is written into a slice of another DataFrame (the in-place column assignment
# raised SettingWithCopyWarning).
unlabelled_tweets = unlabelled_tweets.assign(Opinion=y_mlp_s)
# Only the tweets labelled 0 are passed on to the polarity classifier.
X_pol = unlabelled_tweets.loc[unlabelled_tweets['Opinion'] == 0, 'tweet_text']
unlabelled_cleaned = cleaning(X_pol)

## Show the first 5 stemmed tweets
for tweet in unlabelled_cleaned[0:5]:
  print(tweet)

# Bag-of-words counts capped at 3000 vocabulary terms.
# NOTE(review): fit_transform() learns a new vocabulary from this subset. If
# X_train_p came from a different fitted CountVectorizer, the columns here will
# not line up with the features the polarity classifier was trained on —
# confirm, and reuse the training vectorizer's transform() if so.
cv=CountVectorizer(max_features=3000)
X_fin_unlabelled = cv.fit_transform(unlabelled_cleaned).toarray()
X_fin_unlabelled.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


china brazil world greenest central bank activist say
u stock end modestli lower investor pull heavyweight tech stock spook new high trea
china march factori activ expand faster pace offici pmi
growth china servic sector surg march
market first quarter ride tiger wake bear


(7611, 3000)

In [None]:
#unlabelled_tweets.to_csv('/drive/My Drive/serene/unlabelled_opinion_svm.csv')

In [None]:
"""
Polarity

"""

# Polarity model for this section: an SGDClassifier trained on the labelled
# polarity split. The mlp_clf / y_mlp_p names are carried over from the earlier
# MLP cells; no multi-layer perceptron is involved here.
mlp_clf = SGDClassifier(random_state=42)
mlp_clf.fit(X_train_p, y_train_p)

# Time only the prediction step on the vectorised Opinion == 0 tweets.
# NOTE(review): assumes X_fin_unlabelled shares X_train_p's feature columns — verify.
start = time.time()
y_mlp_p = mlp_clf.predict(X_fin_unlabelled)
end = time.time()

# Number of predictions made, then elapsed wall-clock seconds.
print(len(y_mlp_p))
print('Classfication Time:', end-start)

7611
Classfication Time: 0.055969953536987305


In [None]:
#data = unlabelled_tweets[unlabelled_tweets['Opinion'] == 0]
#data['Target'] = y_mlp_p
#data.to_csv('/drive/My Drive/serene/unlabelled_opinion_auto_svm.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


# Enhanced Classification (Stacking Ensemble)

In [None]:
def Stacking(model,train,y,test,n_fold):
  """Produce stacking meta-features from one base model.

  Training rows get out-of-fold predictions from a stratified K-fold split, so
  no row is predicted by a model that saw its label. The model is then refit on
  the whole training set to predict the test rows.

  Parameters
  ----------
  model : estimator with ``fit(X, y)`` and ``predict(X)``; it is refit in place.
  train : training features, indexable by an integer index array and exposing
      ``.shape`` (e.g. a ``numpy.ndarray``).
  y : pandas Series of training labels (read via ``.values`` and ``.iloc``).
      Predictions are stored as floats, so labels must be numeric.
  test : test features accepted by ``model.predict``.
  n_fold : int, number of stratified folds.

  Returns
  -------
  test_pred : numpy.ndarray, shape (n_test, 1)
      One prediction per test row.
  train_pred : numpy.ndarray, shape (n_train,)
      Out-of-fold predictions, aligned row-for-row with ``train`` and ``y``.
  """
  # random_state only has an effect when shuffling is enabled; recent
  # scikit-learn versions raise an error if it is set while shuffle is False.
  folds=StratifiedKFold(n_splits=n_fold,shuffle=True,random_state=1)
  train_pred=np.empty(train.shape[0],float)
  for train_indices,val_indices in folds.split(train,y.values):
    x_train,x_val=train[train_indices],train[val_indices]
    y_train=y.iloc[train_indices]

    model.fit(X=x_train,y=y_train)
    # Write each fold's predictions back to the rows they belong to; appending
    # them in fold order would leave them misaligned with y.
    train_pred[val_indices]=model.predict(x_val)
  # A single refit on all training rows yields exactly one prediction per test
  # row, rather than one batch per fold stacked end to end.
  model.fit(X=train,y=y)
  test_pred=np.asarray(model.predict(test),dtype=float)
  return test_pred.reshape(-1,1),train_pred

In [None]:
from sklearn.ensemble import RandomForestClassifier

# First base learner. The forest is seeded (42, as elsewhere in this notebook)
# so its predictions are reproducible from one run to the next.
model1 = RandomForestClassifier(random_state=42)

# Stacking returns (test predictions, training predictions) for this model.
test_pred1 ,train_pred1=Stacking(model=model1,n_fold=10, train=X_train,test=X_test,y=y_train)

# Wrap both in DataFrames so they can be concatenated column-wise with the
# other base model's predictions.
train_pred1=pd.DataFrame(train_pred1)
test_pred1=pd.DataFrame(test_pred1)



In [None]:
# from sklearn.linear_model import SGDClassifier

# model2 = SGDClassifier(random_state=42)

# test_pred2 ,train_pred2=Stacking(model=model2,n_fold=10,train=X_train,test=X_test,y=y_train)

# train_pred2=pd.DataFrame(train_pred2)
# test_pred2=pd.DataFrame(test_pred2)

from sklearn.naive_bayes import MultinomialNB

"""
Multinomial Naive Bayes model

"""
# Second base learner: multinomial naive Bayes.
model2 = MultinomialNB()

# Stacking returns (test predictions, training predictions); each is wrapped in
# a DataFrame so it can be concatenated with the other base model's column.
test_pred2, train_pred2 = (
    pd.DataFrame(predictions)
    for predictions in Stacking(model=model2,n_fold=10,train=X_train,test=X_test,y=y_train)
)


# y_pred_p = model.predict(X_test_p)



In [None]:
from sklearn import linear_model
from sklearn.feature_extraction.text import TfidfVectorizer

# LogReg = linear_model.LogisticRegression()
# LogReg.fit(X_train, y_train)
# y_predict = LogReg.predict(X_test)
# print(classification_report(y_test, y_predict))

# Level-1 (meta) training features: one column of base-model predictions per
# model — random forest (model1) and multinomial naive Bayes (model2).
df = pd.concat([train_pred1, train_pred2], axis=1)
df.columns = ['rf', 'nb']

# Matching features for the test set. Stacking may hand back more than one
# batch of test predictions stacked end to end, so keep only the last
# len(y_test) rows of each: exactly one row per test sample.
df_test = pd.concat(
    [test_pred1.tail(len(y_test)).reset_index(drop=True),
     test_pred2.tail(len(y_test)).reset_index(drop=True)],
    axis=1,
)
df_test.columns = ['rf', 'nb']

# Meta-learner trained on the base models' predictions.
stacking_model = linear_model.LogisticRegression()
stacking_model.fit(df,y_train)

# Evaluate on the base models' test predictions. The meta-learner was fitted on
# prediction columns, so feeding it the first two raw token-count columns of
# X_test (as before) scores unrelated features.
y_predict_stack = stacking_model.predict(df_test)
print(classification_report(y_test, y_predict_stack))

# model = LogisticRegression(random_state=1)
# model.fit(df,y_train)
# model.score(df_test, y_test)

# model = linear_model.LogisticRegression()
# model.fit(df,y_train)
# y_predict = model.predict(X_test)
# print(classification_report(df_test, y_predict))
# # model.score(df_test, y_predict)




              precision    recall  f1-score   support

           0       0.51      1.00      0.68       333
           1       1.00      0.00      0.01       317

    accuracy                           0.51       650
   macro avg       0.76      0.50      0.34       650
weighted avg       0.75      0.51      0.35       650



In [None]:
# models = [('lr',LogisticRegression()),('svm',make_pipeline(StandardScaler(),SVC()))
# stacking = StackingClassifier(estimators=models)
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import RandomForestClassifier




def get_models():
  """Return the models to evaluate, keyed by a short name.

  Only the stacking ensemble built by get_stacking() is currently included.
  """
  return {'stacking': get_stacking()}



def evaluate_model(model, X, y):
  """Cross-validate `model` on (X, y) and return the per-split accuracy scores.

  Uses repeated stratified 10-fold cross-validation (3 repeats, seed 1) on all
  available cores; a failure while fitting is raised instead of being scored.
  """
  splitter = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
  return cross_val_score(model, X, y, scoring='accuracy', cv=splitter, n_jobs=-1, error_score='raise')


def get_stacking():
  """Build an unfitted stacking classifier.

  Level 0: SGD classifier (seed 42) and multinomial naive Bayes.
  Level 1: logistic regression, trained on 2-fold cross-validated predictions.
  """
  base_learners = [
    ('sgd', SGDClassifier(random_state=42)),
    ('bayes', MultinomialNB()),
  ]
  meta_learner = LogisticRegression()
  return StackingClassifier(estimators=base_learners, final_estimator=meta_learner, cv=2)

def get_stacking_2():
  """Build an unfitted stacking classifier.

  Level 0: SGD classifier and random forest, both seeded with 42 so repeated
  runs produce the same ensemble.
  Level 1: logistic regression, trained on 2-fold cross-validated predictions.
  """
  # define the base models
  level0 = [
    ('sgd', SGDClassifier(random_state=42)),
    # Seeded: an unseeded forest gives different scores on every run.
    ('randomforest', RandomForestClassifier(random_state=42)),
  ]
  # define meta learner model
  level1 = LogisticRegression()
  # define the stacking ensemble
  return StackingClassifier(estimators=level0, final_estimator=level1, cv=2)


def get_stacking_3():
  """Build an unfitted stacking classifier.

  Level 0: multinomial naive Bayes and a random forest seeded with 42 so
  repeated runs produce the same ensemble.
  Level 1: logistic regression, trained on 2-fold cross-validated predictions.
  """
  # define the base models
  level0 = [
    ('bayes', MultinomialNB()),
    # Seeded: an unseeded forest gives different scores on every run.
    ('randomforest', RandomForestClassifier(random_state=42)),
  ]
  # define meta learner model
  level1 = LogisticRegression()
  # define the stacking ensemble
  return StackingClassifier(estimators=level0, final_estimator=level1, cv=2)


def get_stacking_4():
  """Build an unfitted stacking classifier.

  Level 0: multinomial naive Bayes and logistic regression.
  Level 1: SGD classifier (seed 42), trained on 2-fold cross-validated predictions.
  """
  base_learners = [
    ('bayes', MultinomialNB()),
    ('logistics', LogisticRegression()),
  ]
  meta_learner = SGDClassifier(random_state=42)
  return StackingClassifier(estimators=base_learners, final_estimator=meta_learner, cv=2)


# def get_stacking_3():
#   # define the base models
#   level0 = list()
#   # level0.append(('lr', LogisticRegression()))
#   # level0.append(('knn', KNeighborsClassifier()))
#   level0.append(('bayes', MultinomialNB()))
#   level0.append(('logistics', LogisticRegression())
#   # level0.append(('bayes', MultinomialNB()))
#   # level0.append(('randomforest', RandomForestClassifier()))
#   # define meta learner model
#   level1 = SGDClassifier(random_state=42)
#   # define the stacking ensemble
#   model = StackingClassifier(estimators=level0, final_estimator=level1, cv=2)
#   return model

# model = MultinomialNB()
# model.fit(X_train,y_train)
# y_pred = model.predict(X_test)



# cf = classification_report(y_test, y_pred)
# print(cf)


# models = get_models()

# results, names = list(), list()
# for name, model in models.items():
# 	scores = evaluate_model(model, X, y)
# 	results.append(scores)
# 	names.append(name)
# 	print('>%s %.3f (%.3f)' % (name, mean(scores), std(scores)))


# names = list(), list()
# for model in models.items():
#   model.fit(X_train,y_train)
#   y_pred = model.predict(X_test)
#   cf = classification_report(y_test, y_pred)
  
#   # print(names)
#   print(cf)

# Fit each stacking variant on the training split and print its classification
# report for the held-out test split. fit() returns the fitted estimator, so
# construction and fitting are chained; every result keeps its own name.

# Variant 1
tester_model1 = get_stacking().fit(X_train, y_train)
y_pred1 = tester_model1.predict(X_test)
cf1 = classification_report(y_test, y_pred1)
print("Level 0: sgd & bayes; Level 1: logistics", cf1, sep="\n")

# Variant 2
tester_model2 = get_stacking_2().fit(X_train, y_train)
y_pred2 = tester_model2.predict(X_test)
cf2 = classification_report(y_test, y_pred2)
print("Level 0: sgd & random forest; Level 1: logistics", cf2, sep="\n")

# Variant 3
tester_model3 = get_stacking_3().fit(X_train, y_train)
y_pred3 = tester_model3.predict(X_test)
cf3 = classification_report(y_test, y_pred3)
print("Level 0: bayes & random forest; Level 1: logistics", cf3, sep="\n")

# Variant 4
tester_model4 = get_stacking_4().fit(X_train, y_train)
y_pred4 = tester_model4.predict(X_test)
cf4 = classification_report(y_test, y_pred4)
print("Level 0: logistics & bayes; Level 1: sgd", cf4, sep="\n")



Level 0: sgd & bayes; Level 1: logistics
              precision    recall  f1-score   support

           0       0.73      0.82      0.77       333
           1       0.78      0.68      0.73       317

    accuracy                           0.75       650
   macro avg       0.75      0.75      0.75       650
weighted avg       0.75      0.75      0.75       650

Level 0: sgd & random forest; Level 1: logistics
              precision    recall  f1-score   support

           0       0.75      0.73      0.74       333
           1       0.72      0.74      0.73       317

    accuracy                           0.73       650
   macro avg       0.73      0.73      0.73       650
weighted avg       0.73      0.73      0.73       650

Level 0: bayes & random forest; Level 1: logistics
              precision    recall  f1-score   support

           0       0.73      0.79      0.76       333
           1       0.76      0.70      0.73       317

    accuracy                           0.