<h2> Theme classification</h2>

In [None]:
#Importing required libraries
import pandas as pd
import time
import numpy as np
import matplotlib.pyplot as plt
import re
import string
import math
from nltk.corpus import stopwords
from sklearn.utils import shuffle
from	sklearn.model_selection	import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset

<h3>Reading datasets</h3>

In [None]:
# Load the three spreadsheets: articles -> df1, tweets -> df2, evaluation data -> df3
df1, df2, df3 = [
    pd.read_excel(path)
    for path in (
        "/content/dev_data_article.xlsx",
        "/content/dev_data_tweet.xlsx",
        "/content/evaluation_data.xlsx",
    )
]

In [None]:
# Wall-clock start mark; the last cell subtracts it from time.time() to print the runtime
start = time.time()

In [None]:
#Clean data in tweets
def clean(df, feature):
    """Cut everything up to the 'RT' / 'QT' marker out of a text column, in place.

    For every value of ``df[feature]``: when it contains ``'RT'`` it is replaced
    by the piece that follows the first ``'RT'`` (ending at the next ``'RT'`` if
    there is one). The same rule is then applied for ``'QT'`` to the updated value.

    Args:
        df: DataFrame that owns the column; the column is overwritten in place.
        feature: Name of a column whose values are strings.

    Returns:
        None.
    """
    def _after_marker(text, marker):
        # split(marker)[1] is the segment between the first and the second marker
        return text.split(marker)[1] if marker in text else text

    # Assigning the whole column works for any index and avoids the chained
    # indexing (df[feature][i] = ...) that triggers SettingWithCopyWarning.
    df[feature] = df[feature].apply(
        lambda text: _after_marker(_after_marker(text, 'RT'), 'QT')
    )

In [None]:
!pip install contractions
import contractions



<h3>Common cleaning for articles and tweets</h3>

In [None]:
#Common cleaning for articles and tweets
def preprocess(df, feature):
  """Normalise a text column in place (shared by articles and tweets).

  Steps, in order: remove URLs, expand contractions, lower-case, remove tokens
  that contain a digit, remove punctuation, turn line breaks into spaces and
  collapse runs of spaces.

  Args:
      df: DataFrame that owns the column; the column is overwritten in place.
      feature: Name of a column whose values are strings.

  Returns:
      None.
  """
  punctuation_pattern = '[%s]' % re.escape(string.punctuation)

  def _normalise(text):
    # URLs may appear anywhere in the text, not only at the start of a line
    text = re.sub(r'https?://\S+', '', text)
    # expanding contractions
    text = contractions.fix(text)
    text = text.lower()
    # removing digits (any token containing one) and punctuation
    text = re.sub(r'\w*\d\w*', '', text)
    text = re.sub(punctuation_pattern, '', text)
    # line breaks become a space so the words on either side are not fused together
    text = re.sub(r'[\r\n]+', ' ', text)
    # removing extra spaces
    return re.sub(' +', ' ', text)

  df[feature] = df[feature].apply(_normalise)

In [None]:
# Articles: normalise the body text, then the headline
for column in ('Text', 'Headline'):
    preprocess(df1, column)

# Tweets and evaluation rows: clean() first (RT/QT handling), then the shared normalisation
for frame, column in ((df2, 'Tweet'), (df3, 'Text')):
    clean(frame, column)
    preprocess(frame, column)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


<h3>Using headlines instead of articles to train the model when articles are too short or meaningless</h3>

In [None]:
# Use the headline instead of the article text when the text is too short or meaningless

# Cookie-banner boilerplate (already in preprocessed form) that carries no article content
bad_strings = ["by continuing to use the site you agree to the use of cookies you can find out more by clicking this link",
               "we use cookies for analytics advertising and to improve our site you agree to our use of cookies by continuing to use our site to know more see our cookie policy and cookie settings"
              ]

MIN_TEXT_LENGTH = 60  # texts with fewer characters than this are treated as unusable

# Boolean masks + .loc replace the row loop with chained indexing
# (df1["Text"][ind] = ...), which raises SettingWithCopyWarning and may write
# to a temporary copy instead of df1.
unusable_text = (df1["Text"].str.len() < MIN_TEXT_LENGTH) | df1["Text"].isin(bad_strings)
has_headline = df1["Headline"].str.len() != 0
use_headline = unusable_text & has_headline
df1.loc[use_headline, "Text"] = df1.loc[use_headline, "Headline"]
    

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


<h3>Combining articles and tweets to make a large training dataset</h3>

In [None]:
# Combine articles and tweets into one large training dataset.
# The axis is given by keyword: passing it positionally to drop()/concat() was
# deprecated in pandas 1.3 and is rejected by pandas 2.x.
df1 = df1.drop(columns="Headline")
# Bring the tweet frame's column names in line with the article frame's
df2 = df2.rename(columns={"Tweet_ID" : "Text_ID" , "Tweet" : "Text", "Mobile_Tech_Tag":"Mobile_Tech_Flag"})
df_combine = pd.concat([df1, df2], axis=0)

In [None]:
# Renumber the rows 0..n-1: after concatenation both source frames contribute
# their own row labels, so the combined index is no longer a clean sequence.
df_combine = df_combine.reset_index(drop=True)

In [None]:
!pip install -U sentence-transformers
!pip install transformers

Requirement already up-to-date: sentence-transformers in /usr/local/lib/python3.7/dist-packages (1.0.3)


<h3> Generating Embeddings</h3>

In [None]:
from sentence_transformers import SentenceTransformer, util

# One sentence encoder is used for both the training texts and the evaluation texts
model = SentenceTransformer('distiluse-base-multilingual-cased-v2')

# Pass plain lists of strings (the documented input type of encode()) rather than
# pandas Series, so the call does not depend on the frames' index labels.
query_embeddings = model.encode(df_combine['Text'].tolist())
test_embeddings = model.encode(df3['Text'].tolist())
print(query_embeddings.shape)
print(test_embeddings.shape)

(8000, 512)
(432, 512)


In [None]:
# Pick the GPU when CUDA is usable, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Device available for running: ", device, sep="\n")

Device available for running: 
cuda


In [None]:
rand = 20  # seed passed to shuffle() so the permutation is repeatable
labels = df_combine['Mobile_Tech_Flag']
# Embeddings and labels are permuted together, keeping each row paired with its label
embedded, label = shuffle(query_embeddings, labels, random_state=rand)

In [None]:
# Training features / targets and the evaluation features under their modelling names
X_train, Y_train, X_test = embedded, label.values, test_embeddings
print(X_train.shape, Y_train.shape, X_test.shape, sep="\n")

(8000, 512)
(8000,)
(432, 512)


<h3>Training and Predicting</h3>

In [None]:
# Soft-voting ensemble of a random forest and an RBF-kernel SVM.
# X_train / Y_train / X_test are handed to scikit-learn directly; the earlier
# round trip through torch tensors on `device` and back to the CPU did no work.
# Both estimators draw random numbers (tree bootstrapping, SVC probability
# calibration), so they are seeded with `rand` to make reruns repeatable.
rnd_clf = RandomForestClassifier(random_state=rand)
svm_clf = SVC(kernel="rbf", probability=True, random_state=rand)
voting_clf = VotingClassifier(
    estimators=[('rf', rnd_clf), ('svc', svm_clf)],
    voting='soft',
)
voting_clf.fit(X_train, Y_train)
y_pred_vote = voting_clf.predict(X_test)
y_pred_vote

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1,
       1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [None]:
# Seconds of wall-clock time elapsed since `start` was recorded
end = time.time()
print("Runtime:", end - start)

Runtime: 99.06230044364929
