In [None]:
pip install sentence_transformers

# Imports

In [2]:
import pandas as pd
import numpy as np
import spacy
from scipy import stats

from sklearn import linear_model
from sklearn.svm import SVR
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier


from sentence_transformers import SentenceTransformer, losses, models, util
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.readers import InputExample

import torch 
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from torch import nn
import re

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Data Read

In [4]:
# Loading the database
url_train = "/content/drive/MyDrive/F1_Claim_Detection_train.csv"
df_train = pd.read_csv(url_train)

url_test = "/content/drive/MyDrive/F1_Claim_Detection_test.csv"
df_test = pd.read_csv(url_test)

In [5]:
# data preprocessing

def preprocess(text):
  text = re.sub(r"@\w+\b", " ", text)
  text = re.sub(r"https?:\/\/\w*|\w+\.com\w*", " ", text)
  text = re.sub("<\w*>", " ", text)
  text = re.sub(r"\\n", " ", text)
  text = re.sub("\s+", " ", text)
  return text
df_train["tweet"] = df_train["tweet"].apply(lambda x: preprocess(x))
df_test["tweet"] = df_test["tweet"].apply(lambda x: preprocess(x))

# Model

In [None]:
# Creating the embeddings

def get_feature_model(data_frame):
  non_cont_model2 = SentenceTransformer('sentence-transformers/all-distilroberta-v1')
  feature1 = non_cont_model2.encode(data_frame["tweet"])
  return feature1
    


feature_1_train = get_feature_model(df_train)

In [None]:
# Making the train-test split and initializing the classifiers
X_train, Y_train = np.array(feature_1_train), df_train["label"]
X_train, X_test, Y_train, Y_test = train_test_split(np.array(feature_1_train), df_train["label"], test_size = 0.2)
clf_svc = SVC(gamma='auto')
clf_lr = LogisticRegression(random_state=0)
clf_mlp = MLPClassifier(random_state=1, max_iter=1000, early_stopping = True)
clf_adaboost = AdaBoostClassifier(base_estimator = DecisionTreeClassifier(max_depth = 2), n_estimators=100, random_state=0)
# clf = make_pipeline(LogisticRegression(random_state=0))

# Training all the models
clf_svc.fit(X_train, Y_train)
clf_lr.fit(X_train, Y_train)
clf_mlp.fit(X_train, Y_train)
clf_adaboost.fit(X_train, Y_train)

In [8]:
# Making predictions
pred_test_lr = clf_lr.predict(X_test)
pred_train_lr = clf_lr.predict(X_train)

pred_test_mlp = clf_mlp.predict(X_test)
pred_train_mlp = clf_mlp.predict(X_train)

pred_test_svc = clf_svc.predict(X_test)
pred_train_svc = clf_svc.predict(X_train)

pred_test_adaboost = clf_adaboost.predict(X_test)
pred_train_adaboost = clf_adaboost.predict(X_train)

In [9]:
# feature_1_test = get_feature_model2(df_test)

# X_test, Y_test = np.array(feature_1_test), df_test["label"]

## Logistic Regression

In [10]:
print("==============================================================================")
print("TRAINING")
print(accuracy_score(pred_train_lr, Y_train))

print("==============================================================================")
print("TESTING")
print(accuracy_score(pred_test_lr, Y_test))


# print("==============================================================================")
# print("TRAINING")
# # pred_train = clf.predict(X_train)
# print(classification_report(pred_train, Y_train))
print("==============================================================================")
print("CLASSIFICATION REPORT")
print("TESTING")
# pred_test = clf.predict(X_test)
print(classification_report(pred_test_lr, Y_test))

print("==============================================================================")

TRAINING
0.889226914817466
TESTING
0.8683834048640916
CLASSIFICATION REPORT
TESTING
              precision    recall  f1-score   support

           0       0.08      0.88      0.14        17
           1       1.00      0.87      0.93      1381

    accuracy                           0.87      1398
   macro avg       0.54      0.88      0.53      1398
weighted avg       0.99      0.87      0.92      1398



## SVC

In [11]:
print("==============================================================================")
print("TRAINING")
print(accuracy_score(pred_train_svc, Y_train))

print("==============================================================================")
print("TESTING")
print(accuracy_score(pred_test_svc, Y_test))


# print("==============================================================================")
# print("TRAINING")
# # pred_train = clf.predict(X_train)
# print(classification_report(pred_train, Y_train))
print("==============================================================================")
print("CLASSIFICATION REPORT")
print("TESTING")
# pred_test = clf.predict(X_test)
print(classification_report(pred_test_svc, Y_test))

print("==============================================================================")

TRAINING
0.8775948460987831
TESTING
0.8590844062947067
CLASSIFICATION REPORT
TESTING
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       1.00      0.86      0.92      1398

    accuracy                           0.86      1398
   macro avg       0.50      0.43      0.46      1398
weighted avg       1.00      0.86      0.92      1398



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## MLP

In [12]:
print("==============================================================================")
print("TRAINING")
print(accuracy_score(pred_train_mlp, Y_train))

print("==============================================================================")
print("TESTING")
print(accuracy_score(pred_test_mlp, Y_test))


# print("==============================================================================")
# print("TRAINING")
# # pred_train = clf.predict(X_train)
# print(classification_report(pred_train, Y_train))
print("==============================================================================")
print("CLASSIFICATION REPORT")
print("TESTING")
# pred_test = clf.predict(X_test)
print(classification_report(pred_test_mlp, Y_test))

print("==============================================================================")

TRAINING
0.8920901932712957
TESTING
0.8683834048640916
CLASSIFICATION REPORT
TESTING
              precision    recall  f1-score   support

           0       0.10      0.76      0.17        25
           1       1.00      0.87      0.93      1373

    accuracy                           0.87      1398
   macro avg       0.55      0.82      0.55      1398
weighted avg       0.98      0.87      0.91      1398



## Adaboost

In [13]:
print("==============================================================================")
print("TRAINING")
print(accuracy_score(pred_train_adaboost, Y_train))

print("==============================================================================")
print("TESTING")
print(accuracy_score(pred_test_adaboost, Y_test))


# print("==============================================================================")
# print("TRAINING")
# # pred_train = clf.predict(X_train)
# print(classification_report(pred_train, Y_train))
print("==============================================================================")
print("CLASSIFICATION REPORT")
print("TESTING")
# pred_test = clf.predict(X_test)
print(classification_report(pred_test_adaboost, Y_test))

print("==============================================================================")

TRAINING
0.98067287043665
TESTING
0.8404864091559371
CLASSIFICATION REPORT
TESTING
              precision    recall  f1-score   support

           0       0.25      0.40      0.31       124
           1       0.94      0.88      0.91      1274

    accuracy                           0.84      1398
   macro avg       0.59      0.64      0.61      1398
weighted avg       0.88      0.84      0.86      1398



#Final Output

In [14]:
feature_1_test = get_feature_model(df_test)

X_test= np.array(feature_1_test)



In [None]:
pred_y = clf.predict(X_test)
df_out = pd.DataFrame()
df_out["label"] =  pred_y

df_out['id'] = df_out.index

In [None]:
df_out.to_csv('output.csv')