In [20]:
import os
import matplotlib.pyplot as plt
import cv2
import random
import numpy as np
import math
import pandas as pd
import string
import time
import math

In [21]:
#Key functions
def load_stop_words(stop_word_file):
        # The stop words list is taken from the following github page- https://github.com/Alir3z4/stop-words/blob/master/english.txt
        stop_words = []
        with open(stop_word_file, 'r', encoding='utf8') as f:
            stop_words = f.readlines()
            stop_words = [word.strip() for word in stop_words]
        return stop_words

def prepare_bag_of_words(train_data, stop_words, cutoff=1):
    deceptive_words = []
    truthful_words = []
    deceptive_sentences = []
    truthful_sentences = []

    for sentence, label in zip(train_data['full_text'], train_data['Target']):
        sentence = sentence.lower()
        # The following statement to remove punctuations is referred from- https://www.pythonpool.com/remove-punctuation-python/
        # Reference starts here
        sentence = sentence.translate(str.maketrans('','', string.punctuation))
        # Reference ends here
        sentence = sentence.split()
        sentence = [word for word in sentence if word not in stop_words]
        #Truthful=1
        #Deceptive=0
        if label == 1:
            truthful_words.extend(sentence)
            truthful_sentences.append(sentence)
        elif label == 0:
            deceptive_words.extend(sentence)
            deceptive_sentences.append(sentence)

    truthful_keys = list(set(truthful_words))
    deceptive_keys = list(set(deceptive_words))

    bag_of_words_truthful = {word:0 for word in truthful_keys}    
    for sentence in truthful_sentences:
        for word in truthful_keys:
            if word in sentence:
                bag_of_words_truthful[word] += 1
    
    bag_of_words_deceptive = {word:0 for word in deceptive_keys}
    for sentence in deceptive_sentences:
        for word in deceptive_keys:
            if word in sentence:
                bag_of_words_deceptive[word] += 1

    # Reduce bag of words based on cutoff
    bag_of_words_truthful = {k:v for k,v in bag_of_words_truthful.items() if v > cutoff}
    bag_of_words_deceptive = {k:v for k,v in bag_of_words_deceptive.items() if v > cutoff}

    # Calculate probability of truthful and deceptive
    p_of_truthful = len(truthful_sentences)/(len(truthful_sentences ) + len(deceptive_sentences))
    p_of_deceptive = len(deceptive_sentences)/(len(truthful_sentences) + len(deceptive_sentences))

    return len(truthful_sentences), len(deceptive_sentences), bag_of_words_deceptive, bag_of_words_truthful, p_of_truthful, p_of_deceptive

def prepare_test_message(sentence, stop_words):
    sentence = sentence.lower()
    sentence = sentence.translate(str.maketrans('','', string.punctuation))
    sentence = sentence.split()
    sentence = [word for word in sentence if word not in stop_words]
    return sentence  

def classifier(train_data, test_data):
    alpha = 1
    stop_words = load_stop_words("stop_words.txt")
    predicted = []

    no_of_truthful_msg, no_of_deceptive_msg, bag_of_words_deceptive, \
        bag_of_words_truthful, p_of_truthful, p_of_deceptive = prepare_bag_of_words(train_data, stop_words)

    for sentence in test_data['full_text']:
        sentence = prepare_test_message(sentence, stop_words) 
        p_of_word_given_truthful = 0
        p_of_word_given_deceptive = 0
        
        for word in sentence:
            p_of_word_given_truthful += math.log((bag_of_words_truthful.get(word, 0) + alpha)/(no_of_truthful_msg + alpha*len(bag_of_words_truthful)))
            p_of_word_given_deceptive += math.log((bag_of_words_deceptive.get(word, 0) + alpha)/(no_of_deceptive_msg + alpha*len(bag_of_words_deceptive)))

        # Applying log on probabilities
        p_of_truthful_given_word = p_of_word_given_truthful + math.log(p_of_truthful)
        p_of_deceptive_given_word = p_of_word_given_deceptive + math.log(p_of_deceptive)

        # if p_of_truthful_given_word - p_of_deceptive_given_word > 0:
        #     predicted.append(1)
        #     # print("Truthful")
        # else:
        #     # print("Deceptive")
        #     predicted.append(0)
        
        dummy_val_deceptive=np.exp(p_of_deceptive_given_word)
        dummy_val_truthful=np.exp(p_of_truthful_given_word)
        try:
            modified_val_spam=dummy_val_deceptive/(dummy_val_deceptive+dummy_val_truthful)
        except:
            modified_val_spam=0
        modified_val_truthful=1-modified_val_spam
        predicted.append(modified_val_truthful)
    return predicted

In [22]:
os.chdir(r"C:\Key files- GNA\Indiana University\Hackathon\Data Science- March")
train_dataset=pd.read_csv("New_train_dataset.csv")

train_dataset_filtered=train_dataset.drop(['Additional.Comments.x','Additional.Comments.y','sample_name','create_date','Unnamed: 0', 'Unnamed: 0.1','ID', 'user','userID','Sample.ID.x','Sample.ID.y','In.English.x', 'In.English.y','User.x', 'User.y'],axis=1)

#Cleaning
df=train_dataset_filtered
# merge two columns that are same just few values are in one column and rest in other
df['Is.About.the.Holocaust.x'] = df['Is.About.the.Holocaust.x'].fillna(df['Is.About.The.Holocaust.x'])
df['Is.About.the.Holocaust.y'] = df['Is.About.the.Holocaust.y'].fillna(df['Is.About.The.Holocaust.y'])
# drop the unwanted columns 
df.drop(['Is.About.The.Holocaust.x','Is.About.The.Holocaust.y'],inplace=True,axis=1)

# Average out to columns that have same meaning except are filled by different experts
# Approach used: average out the values 
df['Still.Exists'] = df[['Still.Exists.x', 'Still.Exists.y']].mean(axis=1)
df['Sarcasm'] = df[['Sarcasm.x', 'Sarcasm.y']].mean(axis=1)
df['Disagree.With'] = df[['Disagree.With.x', 'Disagree.With.y']].mean(axis=1)
df['Sentiment.Rating'] = df[['Sentiment.Rating.x', 'Sentiment.Rating.y']].mean(axis=1)
df['Is.About.the.Holocaust'] = df[['Is.About.the.Holocaust.x', 'Is.About.the.Holocaust.y']].mean(axis=1)
df['IHRA.Section'] = df[['IHRA.Section.x', 'IHRA.Section.y']].mean(axis=1)
# df['Antisemitism.Rating'] = df[['Antisemitism.Rating.x', 'Antisemitism.Rating.y']].mean(axis=1)
df['Calling.Out'] = df[['Calling.Out.x', 'Calling.Out.y']].mean(axis=1)

# drop the unwanted columns
df.drop(['Still.Exists.x', 'Still.Exists.y','Sarcasm.x', 'Sarcasm.y','Disagree.With.x', 'Disagree.With.y','Sentiment.Rating.x', 'Sentiment.Rating.y','Is.About.the.Holocaust.x', 'Is.About.the.Holocaust.y'],inplace=True,axis=1)
# df.drop(['IHRA.Section.x', 'IHRA.Section.y','Antisemitism.Rating.x', 'Antisemitism.Rating.y','Calling.Out.x', 'Calling.Out.y'],inplace=True,axis=1)
df.drop(['IHRA.Section.x', 'IHRA.Section.y','Calling.Out.x', 'Calling.Out.y'],inplace=True,axis=1)

#train_dataset_filtered.columns
#train_dataset_filtered.to_csv("train_dataset_filtered.csv", index=False)


train_dataset_filtered=df
train_two_cols=train_dataset_filtered.loc[:,['full_text','Target']]
#smaller_train_dataset=pd.concat([X_train,y_train],axis=1)

results= classifier(train_two_cols, pd.DataFrame(train_two_cols['full_text']))
results=pd.DataFrame(results,columns=['probabilities_of_1'])

fina_train=pd.concat([train_dataset_filtered,results],axis=1)

#Need to remove this 
#fina_train=fina_train.loc[:,['key','Target']]

#One hot encoding
key_df = pd.get_dummies(fina_train['key'], prefix='key_')
fina_train = pd.merge(
    left=fina_train,
    right=key_df,
    left_index=True,
    right_index=True,
    )
fina_train.drop(['full_text','key'],axis=1,inplace=True)
#fina_train.drop(['probabilities_of_1','full_text','Additional.Comments.x','Additional.Comments.y','sample_name','key','create_date'],axis=1,inplace=True)
#fina_train.drop(['key'],axis=1,inplace=True)


#fina_train=fina_train.loc[:,['key','Target']]

fina_train['RT_TF']=fina_train['RT_TF'].astype(int)

#fina_train.to_csv("fina_train.csv",index=False)

#Splitiing data
y=fina_train.loc[:,['Target']]
X=fina_train.drop(['Target'],axis=1)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [23]:
#RandomForest
from sklearn.ensemble import RandomForestClassifier
classifier2 = RandomForestClassifier(n_estimators = 500, criterion = 'entropy', random_state = 0)
classifier2.fit(X_train, y_train)

  classifier2.fit(X_train, y_train)


RandomForestClassifier(criterion='entropy', n_estimators=500, random_state=0)

In [24]:
y_pred = classifier2.predict(X_test)

In [25]:
#Evaluation of model
from sklearn.metrics import confusion_matrix, accuracy_score
accuracy_score(y_test, y_pred)

1.0

In [26]:
from sklearn.metrics import classification_report
classification_report(y_test, y_pred)

'              precision    recall  f1-score   support\n\n           0       1.00      1.00      1.00       488\n           1       1.00      1.00      1.00       156\n\n    accuracy                           1.00       644\n   macro avg       1.00      1.00      1.00       644\nweighted avg       1.00      1.00      1.00       644\n'

In [27]:
#------------------------------------------------------Test data----------------------------------------
os.chdir(r"C:\Key files- GNA\Indiana University\Hackathon\Data Science- March")
test_dataset=pd.read_csv("test_dataset_latest.csv")
#test_dataset_filtered=test_dataset

test_dataset_filtered=test_dataset.drop(['Additional.Comments.x','Additional.Comments.y','sample_name','create_date','Unnamed: 0', 'Unnamed: 0.1','ID', 'user','userID','Sample.ID.x','Sample.ID.y','In.English.x', 'In.English.y','User.x', 'User.y'],axis=1)

#Cleaning
df=test_dataset_filtered
# merge two columns that are same just few values are in one column and rest in other
df['Is.About.the.Holocaust.x'] = df['Is.About.the.Holocaust.x'].fillna(df['Is.About.The.Holocaust.x'])
df['Is.About.the.Holocaust.y'] = df['Is.About.the.Holocaust.y'].fillna(df['Is.About.The.Holocaust.y'])
# drop the unwanted columns 
df.drop(['Is.About.The.Holocaust.x','Is.About.The.Holocaust.y'],inplace=True,axis=1)

# Average out to columns that have same meaning except are filled by different experts
# Approach used: average out the values 
df['Still.Exists'] = df[['Still.Exists.x', 'Still.Exists.y']].mean(axis=1)
df['Sarcasm'] = df[['Sarcasm.x', 'Sarcasm.y']].mean(axis=1)
df['Disagree.With'] = df[['Disagree.With.x', 'Disagree.With.y']].mean(axis=1)
df['Sentiment.Rating'] = df[['Sentiment.Rating.x', 'Sentiment.Rating.y']].mean(axis=1)
df['Is.About.the.Holocaust'] = df[['Is.About.the.Holocaust.x', 'Is.About.the.Holocaust.y']].mean(axis=1)
df['IHRA.Section'] = df[['IHRA.Section.x', 'IHRA.Section.y']].mean(axis=1)
# df['Antisemitism.Rating'] = df[['Antisemitism.Rating.x', 'Antisemitism.Rating.y']].mean(axis=1)
df['Calling.Out'] = df[['Calling.Out.x', 'Calling.Out.y']].mean(axis=1)

# drop the unwanted columns
df.drop(['Still.Exists.x', 'Still.Exists.y','Sarcasm.x', 'Sarcasm.y','Disagree.With.x', 'Disagree.With.y','Sentiment.Rating.x', 'Sentiment.Rating.y','Is.About.the.Holocaust.x', 'Is.About.the.Holocaust.y'],inplace=True,axis=1)
# df.drop(['IHRA.Section.x', 'IHRA.Section.y','Antisemitism.Rating.x', 'Antisemitism.Rating.y','Calling.Out.x', 'Calling.Out.y'],inplace=True,axis=1)
df.drop(['IHRA.Section.x', 'IHRA.Section.y','Calling.Out.x', 'Calling.Out.y'],inplace=True,axis=1)

#test_dataset_filtered.columns
#test_dataset_filtered.to_csv("test_dataset_filtered.csv", index=False)

test_two_cols=test_dataset_filtered.loc[:,['full_text']]
#smaller_test_dataset=pd.concat([X_test,y_test],axis=1)

results= classifier(train_two_cols, pd.DataFrame(test_two_cols['full_text']))
results=pd.DataFrame(results,columns=['probabilities_of_1'])

fina_test=pd.concat([test_dataset_filtered,results],axis=1)

#One hot encoding
key_df = pd.get_dummies(fina_test['key'], prefix='key_')
fina_test = pd.merge(
    left=fina_test,
    right=key_df,
    left_index=True,
    right_index=True,
)
#fina_test.drop(['full_text','Additional.Comments.x','Additional.Comments.y','sample_name','key','create_date'],axis=1,inplace=True)

fina_test.drop(['full_text','key'],axis=1,inplace=True)

fina_test['RT_TF']=fina_test['RT_TF'].astype(int)

# RF
y_pred_final_test = classifier2.predict(fina_test)
y_pred_final_test =pd.DataFrame(y_pred_final_test,columns=['Target'])
y_pred_final_test.to_csv("final_submission_10_lr.csv",index=False)