# **PDS Project Model**

Deployed using **Streamlit**

In [1]:
from google.colab import drive
# Mount Google Drive into the Colab filesystem; the dataset is read from
# /content/drive/MyDrive further down.
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np

In [3]:
# Location of the BRI dataset on the mounted Drive.
DATA_PATH = "/content/drive/MyDrive/Balanced_BRI_data.csv"

# Load the CSV and preview its first rows.
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,text,date,source,month,year,BRIscore,BRIsentiment,tokenized_text
0,0,0,0,yuan clearing service officially launched in a...,2022-11-09 23:34:00,globaltimes,11,2022,0.7351,Positive,"['yuan', 'clearing', 'service', 'officially', ..."
1,2,2,2,simulating speed,2022-11-08 22:56:00,globaltimes,11,2022,0.4215,Positive,"['simulating', 'speed']"
2,4,4,4,china and latin america have more and more com...,2022-11-09 21:51:00,globaltimes,11,2022,0.5994,Positive,"['china', 'latin', 'america', 'common', 'langu..."
3,7,7,7,tackling climate change china and asean are im...,2022-11-07 16:12:00,globaltimes,11,2022,0.4404,Positive,"['tackling', 'climate', 'change', 'china', 'as..."
4,9,9,9,chinalaos railway a vivid example of highquali...,2022-11-07 19:40:00,globaltimes,11,2022,0.9299,Positive,"['chinalaos', 'railway', 'vivid', 'example', '..."


In [4]:
# Drop the 'Unnamed: *' columns (presumably stale index columns written by
# earlier CSV saves — TODO confirm).
# Reassigning instead of using inplace=True, together with errors='ignore',
# makes this cell safe to re-run: a second execution is a no-op rather than a
# KeyError for columns that are already gone.
df = df.drop(columns=['Unnamed: 0.2', 'Unnamed: 0.1', 'Unnamed: 0'], errors='ignore')

In [5]:
# Class distribution of the sentiment label that is used as the target below.
df["BRIsentiment"].value_counts()

Positive    7336
Neutral     4332
Negative    1669
Name: BRIsentiment, dtype: int64

In [6]:
# (rows, columns) of the frame after the unnamed columns were dropped.
df.shape

(13337, 8)

In [7]:
#import nltk, warnings, string
#warnings.filterwarnings('ignore')
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [8]:
# Build TF-IDF features from the `tokenized_text` column (token lists stored
# as strings, as shown in the preview above).
# NOTE(review): the vectorizer is fitted on the full corpus before the
# train/test split, so its vocabulary and IDF weights also reflect the test
# documents — consider fitting on the training split only.
text = df["tokenized_text"]
tfidf = TfidfVectorizer()
text_feature = tfidf.fit_transform(text)

In [9]:
# Feature matrix and target labels.
# The TF-IDF output is kept as a SciPy sparse matrix instead of being densified
# with .toarray(): a dense (documents x vocabulary) float array is almost all
# zeros and can be very large in memory, while train_test_split and
# LogisticRegression both accept sparse input directly.
x = text_feature
y = df.BRIsentiment

In [10]:
# Hold out 30% of the samples for evaluation; the fixed random_state keeps the
# split reproducible across runs.
# NOTE(review): the split is not stratified although the classes are imbalanced
# (7336 / 4332 / 1669 in the value counts above) — consider stratify=y.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=23,
)

In [11]:
# Sanity check: number of labels (matches the row count reported by df.shape).
len(y)

13337

In [12]:
# Training KNN (currently commented out)

#knn = KNeighborsClassifier(n_neighbors=43)
#knn.fit(x_train, y_train)

In [13]:
# Training Logistic Regression

# max_iter=1000 — presumably raised so the solver converges on the
# high-dimensional TF-IDF features (TODO confirm).
lr = LogisticRegression(max_iter=1000)
lr.fit(X=x_train, y=y_train)

In [14]:
# Predict labels for the held-out split.
lr_pred = lr.predict(x_test)

# Confusion matrix followed by the per-class report. The matrix rows are the
# true classes in the report's order (Negative, Neutral, Positive): the row
# sums 484 / 1333 / 2185 match the supports in the printed report.
print(confusion_matrix(y_true=y_test, y_pred=lr_pred))
print(classification_report(y_true=y_test, y_pred=lr_pred))

# Overall fraction of correctly classified test samples.
accuracy = accuracy_score(y_true=y_test, y_pred=lr_pred)
print("Accuracy:", accuracy)

[[  94  163  227]
 [  39  787  507]
 [  21  165 1999]]
              precision    recall  f1-score   support

    Negative       0.61      0.19      0.29       484
     Neutral       0.71      0.59      0.64      1333
    Positive       0.73      0.91      0.81      2185

    accuracy                           0.72      4002
   macro avg       0.68      0.57      0.58      4002
weighted avg       0.71      0.72      0.69      4002

Accuracy: 0.719640179910045


In [15]:
#knn_pred = knn.predict(x_test)

#print(confusion_matrix(y_test, knn_pred))
#print(classification_report(y_test, knn_pred))

#accuracy = accuracy_score(y_test, knn_pred)
#print("Accuracy:", accuracy)

In [16]:
# Naive Bayes
#from sklearn.naive_bayes import MultinomialNB

#naive_bayes = MultinomialNB()
#naive_bayes.fit(x_train, y_train)
#naive_pred = naive_bayes.predict(x_test)

In [17]:
#accuracy = accuracy_score(y_test, naive_pred)
#print("Accuracy:", accuracy)

In [18]:
#import pickle

In [19]:
#pickle_out = open("model.pkl", "wb")
#pickle.dump( lr, pickle_out)
#pickle_out.close()

In [20]:
#pickle_out2 = open("tfidf.pkl", "wb")
#pickle.dump( tfidf, pickle_out2)
#pickle_out2.close()

# **Metrics**

**Accuracy**

In [26]:
# Accuracy of the logistic-regression predictions on the test split
# (same quantity as printed after the classification report above).
accuracy = accuracy_score(y_true=y_test, y_pred=lr_pred)
print("Accuracy:", accuracy)

Accuracy: 0.719640179910045


**Precision**

In [23]:
from sklearn.metrics import precision_score

# Macro average: unweighted mean of the per-class precisions
# (corresponds to the "macro avg" row of the classification report).
precision_macro = precision_score(y_true=y_test, y_pred=lr_pred, average="macro")
print("Macro Precision: ", precision_macro)

Macro Precision:  0.6825499563593512


In [24]:
# Micro average: computed from the pooled counts over all classes; the printed
# value is identical to the accuracy above.
precision_micro = precision_score(y_true=y_test, y_pred=lr_pred, average="micro")
print("Micro Precision: ", precision_micro)

Micro Precision:  0.719640179910045


In [25]:
# Weighted average: per-class precisions weighted by class support
# (corresponds to the "weighted avg" row of the classification report).
precision_weighted = precision_score(y_true=y_test, y_pred=lr_pred, average="weighted")
print("Weighted Precision: ", precision_weighted)

Weighted Precision:  0.7082647227684463


**Recall**

In [27]:
from sklearn.metrics import recall_score

# Macro average: unweighted mean of the per-class recalls, so the weak
# Negative-class recall (0.19 in the report) pulls it well below accuracy.
recall_macro = recall_score(y_true=y_test, y_pred=lr_pred, average="macro")
print("Macro Recall: ", recall_macro)

Macro Recall:  0.5664955391031127


In [28]:
# Micro average: computed from the pooled counts over all classes; the printed
# value is identical to the accuracy above.
recall_micro = recall_score(y_true=y_test, y_pred=lr_pred, average="micro")
print("Micro Recall: ", recall_micro)

Micro Recall:  0.719640179910045


In [33]:
# Weighted average: per-class recalls weighted by class support; the printed
# value is identical to the accuracy above.
recall_weighted = recall_score(y_true=y_test, y_pred=lr_pred, average="weighted")
print("Weighted Recall: ", recall_weighted)

Weighted Recall:  0.719640179910045


**F1-Score**

In [34]:
from sklearn.metrics import f1_score

# Macro average: unweighted mean of the per-class F1 scores.
f1_macro = f1_score(y_true=y_test, y_pred=lr_pred, average="macro")
print("Macro f1-score: ", f1_macro)

Macro f1-score:  0.583525596272681


In [36]:
# Micro average: computed from the pooled counts over all classes; the printed
# value is identical to the accuracy above.
f1_micro = f1_score(y_true=y_test, y_pred=lr_pred, average="micro")
print("Micro f1-score: ", f1_micro)

Micro f1-score:  0.719640179910045


In [35]:
# Weighted average: per-class F1 scores weighted by class support.
f1_weighted = f1_score(y_true=y_test, y_pred=lr_pred, average="weighted")
print("Weighted f1-score: ", f1_weighted)

Weighted f1-score:  0.6936435403196799


**Jaccard's Index**

In [37]:
from sklearn.metrics import jaccard_score

# Macro average: unweighted mean of the per-class Jaccard indices.
j_macro = jaccard_score(y_true=y_test, y_pred=lr_pred, average="macro")
print("Macro Jaccard's Index: ", j_macro)

Macro Jaccard's Index:  0.44380954820579355


In [38]:
# Micro average: Jaccard index computed from the pooled counts over all classes.
j_micro = jaccard_score(y_true=y_test, y_pred=lr_pred, average="micro")
print("Micro Jaccard's Index: ", j_micro)

Micro Jaccard's Index:  0.5620608899297423


In [39]:
# Weighted average: per-class Jaccard indices weighted by class support.
j_weighted = jaccard_score(y_true=y_test, y_pred=lr_pred, average="weighted")
print("Weighted Jaccard's Index: ", j_weighted)

Weighted Jaccard's Index:  0.5526141576187793
