In [1]:
from simple_features import get_length, check_opinion_verbs, get_subjectivity_score

In [2]:
import os.path

import nltk as nltk
import pandas as pd

SAMPLE_ARTICLES_DIR = 'sample_articles/'

# WHOLE DATASET
ARTICLES_DIR = 'IBM_Debater_(R)_CE-EMNLP-2015.v3/articles/'


def transform_files_to_dataframes(articles_file, claims_file, evidences_file):
    # Get text of all articles
    articles_dataframe = pd.read_csv(articles_file, sep="	")
    # Get all claims
    claims_dataframe = pd.read_csv(claims_file, sep="	")
    # Get all evidences
    evidences_dataframe = pd.read_csv(evidences_file, sep="	")
    # Add names of columns to evidence dataframe
    evidences_dataframe.columns = ['Topic', 'Claim', 'Evidence', 'Evidence Type']
    return articles_dataframe, claims_dataframe, evidences_dataframe

# if we do not use evidences_dataframe, maybe delete it?

In [3]:


def get_labelled_sentences_from_data(articles_file, claims_file, evidences_file):
    articles_dataframe, claims_dataframe, evidences_dataframe = transform_files_to_dataframes(
        articles_file, claims_file, evidences_file
    )
    # print(f"Number of articles {len(articles_dataframe.Title)}")
    # claim_or_text = "Claim original text"
    # print(f"Number of claims {len(claims_dataframe.get(claim_or_text))}")
    # ev = "Evidence"
    # print(f"Number of evidences {len(evidences_dataframe.get(ev))}")

    directory = os.fsencode('IBM_Debater_(R)_CE-EMNLP-2015.v3/articles/')
    article_no_claims = []
    number_of_claims = 0
    number_of_evidences = 0
    X = []
    Y = []

    for file in os.listdir(directory):
        filename = os.fsdecode(file)
        with open(os.path.join(directory, file), 'r') as txt_file:
            txt = txt_file.read().replace('\n', '')
            sentences = nltk.tokenize.sent_tokenize(txt)
            # Get topic from article id
            art_id = int(filename[6:-4])
            topic = articles_dataframe.loc[articles_dataframe['article Id'] == art_id, 'Topic']
            if len(topic) > 0:
                # Get all claims for this topic
                claims_to_topic = claims_dataframe.loc[claims_dataframe['Topic'] == topic.item()]  # select rows from a df based on values in column
                list_claims_ori = claims_to_topic['Claim original text'].tolist()
                list_claims_cor = claims_to_topic['Claim corrected version'].tolist()
                for index, row in claims_to_topic.iterrows():
                    claim = row['Claim original text']
                    # print(index)
                    # print(claim)
                    if claim in evidences_dataframe.values:
                        evidence = evidences_dataframe.loc[evidences_dataframe['Claim'] == claim, 'Evidence']
                        # print(claim, evidence)
                        number_of_evidences += 1
                for sentence in sentences:
                    X.append(sentence)
                    if any(s in sentence for s in list_claims_ori) or any(s in sentence for s in list_claims_cor):
                        # Label sentence as 'with claim'
                        Y.append(1)
                        number_of_claims += 1
                    else:
                        # Label sentence as 'without claim'
                        Y.append(0)
            else:
                article_no_claims.append(art_id)
                continue
    return X, Y

In [4]:



sentences,  labels = get_labelled_sentences_from_data(
    'IBM_Debater_(R)_CE-EMNLP-2015.v3/articles.txt', 'IBM_Debater_(R)_CE-EMNLP-2015.v3/claims.txt',
    'IBM_Debater_(R)_CE-EMNLP-2015.v3/evidence.txt'
)

# Downsampling

In [5]:
# import random
# downsampled_sentences = []
# downsampled_labels = []


# for i in range(0, len(sentences)):
#     if labels[i] == 1:
#         downsampled_sentences.append(sentences[i])
#         downsampled_labels.append(labels[i])
        
# not_claims = list(set(sentences) - set(downsampled_sentences))

# # we do not need indices of corresponding labels because we know that they are 0
# inputNumbers = range(0, len(not_claims))

# random_not_cailms = random.sample(inputNumbers , 3000)
# for i in random_not_cailms:
#     downsampled_sentences.append(not_claims[i])
#     downsampled_labels.append(0)
    
        

In [6]:
# sentences = downsampled_sentences
# labels = downsampled_labels

In [7]:
from tqdm import tqdm

from main import stopwords, lemmatize_sentence, lemmatizer, get_wordnet_pos, remove_stopwords
from new_version import get_labelled_sentences_from_data


def preprocess_sentences(sentences):
    X_preprocessed = []

    for sentence in tqdm(sentences):
        clean_sentence = remove_stopwords(sentence)
        # lemmatized_sentence = [lemmatizer.lemmatize(token, get_wordnet_pos(token)) for token in clean_sentence.split()]
        lemmatized_sentence = lemmatize_sentence(clean_sentence)  # Doesn't work, returns list of list
        X_preprocessed.append(' '.join(lemmatized_sentence))

#     save_data(X_preprocessed, 'preprocessed_sentences.txt')
    print(f"The number of preprocessed sentences is {len(X_preprocessed)}.")
#     print(f"The number of labels is {len(y)}.")
    print("The sentences have been saved and will be available as 'preprocessed_sentences.txt'")

    return X_preprocessed

In [8]:
preprocessed_sent = preprocess_sentences(sentences)

100%|████████████████████████████████████| 75620/75620 [06:03<00:00, 208.26it/s]

The number of preprocessed sentences is 75620.
The sentences have been saved and will be available as 'preprocessed_sentences.txt'





In [9]:
print(preprocessed_sent)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [10]:
SEED = 10

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import numpy as np

from sklearn.utils import shuffle



X = preprocessed_sent
y = labels
indexes = [i for i in range(len(X))]
X, y, indexes = shuffle(X, y, indexes, random_state=SEED)


In [11]:
# Divide data in train and test
X_sent_train, X_rest, y_train, y_rest, orig_train, orig_rest = train_test_split(
        X,
        y,
        indexes,
        test_size=.2,
        random_state=SEED
    )

# X_rest, y_rest will be used for testing later! 
# We now need to divide our training set again: into trianing and dev set.


In [12]:

# And again, to get dev set
X_train_dev, X_test_dev, y_train_dev, y_test_dev, orig_dev, orig_test = train_test_split(
        X_sent_train,
        y_train,
        orig_train,
        test_size=.3,
        random_state=SEED
    )

In [13]:
print(len(y_test_dev))

18149


In [14]:


y_train = np.array(y_train_dev)
y_dev = np.array(y_test_dev)


In [15]:
from nltk import word_tokenize

In [68]:
# def get_length(sentence):
#     # or true if longer than 4 words?
#     if len(sentence) > 4:
#         return 1
#     return 0

# Preparing feture matrix for train set

In [69]:
# The firts column of the feature matrix is 0 if sent lenth is <= 4, otherwise 1

# for i in range(len(X_sent_train)):               
#     X_train[i, 0] = get_length(word_tokenize(X_sent_train[i]))




In [70]:
# The second column of the feature matrix is 0 if there are no opinion verbs in the sentence 
# and is 1 if there is at least 1 opinion verbs in the sentence.

# for i in range(len(X_sent_train)):               
#     X_train[i, 1] = check_opinion_verbs(word_tokenize(X_sent_train[i]))

In [71]:
# The third column of the feature matrix contains subjectivity scores.

# for i in range(len(X_sent_train)):               
#     X_train[i, 2] = get_subjectivity_score(word_tokenize(X_sent_train[i]))

# Preparing feture matrix for test set

In [72]:
# x_rest = np.zeros((len(X_sent_rest), 3))

In [73]:
# The firts column of the feature matrix is 0 if sent lenth is <= 4, otherwise 1

# for i in range(len(X_sent_rest)):               
#     x_rest[i, 0] = get_length(word_tokenize(X_sent_rest[i]))

In [74]:

# for i in range(len(X_sent_rest)):               
#     x_rest[i, 1] = check_opinion_verbs(word_tokenize(X_sent_rest[i]))

In [75]:

# for i in range(len(X_sent_rest)):               
#     x_rest[i, 2] = get_subjectivity_score(word_tokenize(X_sent_rest[i]))

# # Vectorizer

In [76]:
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train_dev)

In [77]:
print(X_train.shape)

(42347, 34770)


In [78]:
print(type(X_train))

<class 'scipy.sparse.csr.csr_matrix'>


In [79]:
x_dev = vectorizer.transform(X_test_dev)

In [28]:
#  from sklearn.svm import SVC

# svm = SVC(kernel='linear', random_state=SEED, max_iter=25)
# svm.fit(X_train, y_train)
# binary_balanc_predictions = svm.predict(x_dev)
# # binary_balanc_probs = svm.predict_proba(x_dev)
# print(f'Accuracy on the training set: {svm.score(X_train, y_train)}')
# print(f'Accuracy on the test set: {svm.score(x_dev, y_dev)}')

# print("SVM Accuracy Score ->", accuracy_score(binary_balanc_predictions,y_dev) * 100)


In [29]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

svm = SVC(C =1.0, kernel='linear', degree = 3, gamma = 'auto', probability =True, random_state=SEED)
svm.fit(X_train, y_train)
binary_balanc_predictions = svm.predict(x_dev)
# binary_balanc_probs = svm.predict_proba(x_dev)
print(f'Accuracy on the training set: {svm.score(X_train, y_train)}')
print(f'Accuracy on the test set: {svm.score(x_dev, y_dev)}')

# print("SVM Accuracy Score ->", accuracy_score(binary_balanc_predictions,y_dev) * 100)

Accuracy on the training set: 0.9626556016597511
Accuracy on the test set: 0.7596774193548387


In [30]:
svm.get_params()

{'C': 1.0,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 'auto',
 'kernel': 'linear',
 'max_iter': -1,
 'probability': True,
 'random_state': 10,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [34]:
# Execute this line (of course, modifying the path)
import sys
sys.path.insert(0, '/Users/Margot/Desktop/DataScience/_functions_/_functions_/')

In [31]:
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt
# from wordcloud import WordCloud
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
# import plotly.express as px
import seaborn as sns
%matplotlib inline
from sklearn_crfsuite import metrics
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from sklearn import metrics
import warnings
warnings.filterwarnings('ignore')
from string import ascii_uppercase
from pandas import DataFrame
import seaborn as sn
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import tree
from sklearn.pipeline import Pipeline
from sklearn import svm
from sklearn.model_selection import GridSearchCV
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from collections import Counter
from sklearn.svm import SVR, SVC
from sklearn.metrics import confusion_matrix
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, roc_curve, auc
from sklearn.metrics import roc_auc_score, classification_report
from IPython.display import Markdown, display

In [40]:
pip install sklearn_crfsuite 

Collecting sklearn_crfsuite
  Downloading sklearn_crfsuite-0.3.6-py2.py3-none-any.whl (12 kB)
Collecting python-crfsuite>=0.8.3
  Downloading python_crfsuite-0.9.8-cp37-cp37m-macosx_10_9_x86_64.whl (180 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m180.7/180.7 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting tabulate
  Downloading tabulate-0.9.0-py3-none-any.whl (35 kB)
Installing collected packages: python-crfsuite, tabulate, sklearn_crfsuite
Successfully installed python-crfsuite-0.9.8 sklearn_crfsuite-0.3.6 tabulate-0.9.0
Note: you may need to restart the kernel to use updated packages.


In [33]:
# Step 1. Define the param_grid 
param_grid = {'C': [0.01, 0.1, 0.5, 1, 10, 100], 
              'gamma': [1, 0.75, 0.5, 0.25, 0.1, 0.01, 0.001], 
              'kernel': ['rbf', 'poly', 'linear']}

In [39]:
# Step 2. GridSearch and fit the model
grid = GridSearchCV(SVC(), param_grid, cv=3)
grid.fit(X_train, y_train)

GridSearchCV(cv=3, estimator=SVC(),
             param_grid={'C': [0.01, 0.1, 0.5, 1, 10, 100],
                         'gamma': [1, 0.75, 0.5, 0.25, 0.1, 0.01, 0.001],
                         'kernel': ['rbf', 'poly', 'linear']})

In [40]:
best_params= grid.best_params_
best_params

{'C': 10, 'gamma': 0.75, 'kernel': 'rbf'}

In [None]:




# param_grid={'C': [0.01, 0.1, 0.5, 1, 10, 100],
#                          'gamma': [1, 0.75, 0.5, 0.25, 0.1, 0.01, 0.001],
#                          'kernel': ['rbf', 'poly', 'linear']})



#1) BEST PARAM = {'C': 10, 'gamma': 0.75, 'kernel': 'rbf'}
# if cv = 3

# Accuracy on the training set: 1.0
# Accuracy on the test set: 0.7758064516129032  but C = 3, 6 the same


#2) BEST PARAM = {'C': 10, 'gamma': 0.5, 'kernel': 'rbf'}
# if cv = 2



In [35]:
best_params1 = grid.best_params_
best_params1

{'C': 10, 'gamma': 0.5, 'kernel': 'rbf'}

In [102]:
weights = {0: 1.03, 1: 34.4}
# I got the weight by getting the prob, for example prob of class 0 is  41117 /(1230+41117), 
# the inverse probability is 1/ 41117 /(1230+41117), which is 1.03.

#  The alternative is to give "balanced" to the parameter "class_weight", the re.lation between the weights will
#  the same, for example 0.5 for class 0 and 17 for class 1

In [103]:
# from sklearn.svm import SVC
# from sklearn.metrics import accuracy_score


# 'rbf' to linear

# # class_weight = 'balanced'

svm2 = SVC(C = 10, gamma= 0.75, kernel = 'rbf', class_weight = weights, random_state=SEED)
svm2.fit(X_train, y_train)
binary_balanc_predictions = svm2.predict(x_dev)
# binary_balanc_probs = svm.predict_proba(x_dev)
print(f'Accuracy on the training set: {svm2.score(X_train, y_train)}')
print(f'Accuracy on the test set: {svm2.score(x_dev, y_dev)}')

# print("SVM Accuracy Score ->", accuracy_score(binary_balanc_predictions,y_dev) * 100)

Accuracy on the training set: 0.9988665076628805
Accuracy on the test set: 0.9725053721968152


In [104]:
print(svm2.class_weight_)

[ 1.03 34.4 ]


In [80]:
# from sklearn.svm import SVC
# from sklearn.metrics import accuracy_score


# # class_weight = 'balanced'

# svm2 = SVC(random_state=SEED)
# svm2.fit(X_train, y_train)
# binary_balanc_predictions = svm2.predict(x_dev)
# # binary_balanc_probs = svm.predict_proba(x_dev)
# print(f'Accuracy on the training set: {svm2.score(X_train, y_train)}')
# print(f'Accuracy on the test set: {svm2.score(x_dev, y_dev)}')

# # print("SVM Accuracy Score ->", accuracy_score(binary_balanc_predictions,y_dev) * 100)

Accuracy on the training set: 0.9754173849387205
Accuracy on the test set: 0.9715686814700535


In [43]:
# from sklearn.neural_network import MLPClassifier


# mlp = MLPClassifier(verbose=True, random_state=SEED, max_iter=15)
# mlp.fit(X_train, y_train)
# binary_balanc_predictions = mlp.predict(x_dev)
# binary_balanc_probs = mlp.predict_proba(x_dev)

In [None]:
print(f'Accuracy on the training set: {mlp.score(X_train, y_train)}')
print(f'Accuracy on the test set: {mlp.score(x_dev, y_dev)}')

In [None]:
print(binary_balanc_predictions)

In [None]:
for i in range(len(binary_balanc_predictions)):
    pred_label = binary_balanc_predictions[i]
    real_label = y_rest[i]
    if pred_label != real_label:
        # get the original raw text with the corresponding index
        orig_index = orig_rest[i]
        print(f'Inorrect {pred_label} at sentence: \n    {sentences[orig_index]}\n')

In [108]:
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score, 
                             precision_recall_fscore_support, ConfusionMatrixDisplay,
                             classification_report, confusion_matrix, RocCurveDisplay)

# print(classification_report(y_dev, binary_balanc_predictions))


TypeError: classification_report() missing 1 required positional argument: 'y_pred'