**Απαλακτική Εργασία - Τεχνικές Εξόρυξης Δεδομένων Ουρανία Βουρτζούμη 1115201600024**

In [1]:
import os, glob
import nltk
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split


path_train = os.path.join("c://","/home/rania/Επιφάνεια εργασίας/Τεχνικές Εξόρυξης Δεδομένων/απαλακτική/data/train.csv")
path_labels = os.path.join("c://","/home/rania/Επιφάνεια εργασίας/Τεχνικές Εξόρυξης Δεδομένων/απαλακτική/data/impermium_verification_labels.csv")
path_set = os.path.join("c://","/home/rania/Επιφάνεια εργασίας/Τεχνικές Εξόρυξης Δεδομένων/απαλακτική/data/impermium_verification_set.csv")

In [2]:
train = pd.read_csv(path_train)
labels = pd.read_csv(path_labels)
sets = pd.read_csv(path_set)

**Προεπεξεργασία και καθάρισμα δεδομένων**

In [3]:
def clean(comment):
    tokens = nltk.word_tokenize(comment)

    tokens = [w.lower() for w in tokens]
    
    tokens = [word for word in tokens if word.isalpha() and len(word) > 1]
    cln = " "
    cln = cln.join(tokens)
    return cln

In [4]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score

def NaiveBayes_prediction(xtrain, ytrain ,xtest , ytest, ngrams, multinominal):
    
    vectorizer = CountVectorizer(ngram_range=(1, ngrams))
    X_train = vectorizer.fit_transform(xtrain)
    X_train = X_train.todense()
    
    if multinominal == 0:
        model = GaussianNB()
    else:
        model = MultinomialNB(alpha=1.0)

    model.fit(X_train, ytrain)
    
    X_test = xtest
    X_test = vectorizer.transform(X_test)
    X_test = X_test.todense()

    predictions = model.predict(X_test)
    
    Y_test = ytest

    print('Accuracy score: ', accuracy_score(ytest, predictions))
    print('F1 score: ', f1_score(Y_test, predictions))

#     print('Precision score: ', precision_score(ytest, predictions))
#     print('Recall score: ', recall_score(ytest, predictions))

In [6]:
train['Comment'] = train['Comment'].apply(lambda x: clean(x))
sets['Comment'] = sets['Comment'].apply(lambda x: clean(x))

**Classification με NaiveBayes**

In [7]:
NaiveBayes_prediction(train['Comment'], train['Insult'], sets['Comment'], labels['Insult'], 1, 0)

Accuracy score:  0.5221476510067115
F1 score:  0.5291005291005291


**Βελτιστοποίηση NaiveBayes**

**1.Lemmatization**

In [8]:
from nltk.stem import WordNetLemmatizer

def lemmatization(comment):
    tokens = nltk.word_tokenize(comment)
    
    lemmatizer = WordNetLemmatizer() 
    lemmas = [lemmatizer.lemmatize(word) for word in tokens]
    
    cln = " "
    cln = cln.join(tokens)
    return cln

In [9]:
train_lemmatization = train['Comment'].apply(lambda x: lemmatization(x))
sets_lemmatization = sets['Comment'].apply(lambda x: lemmatization(x))

**Prediction - Accuracy**

In [10]:
NaiveBayes_prediction(train_lemmatization, train['Insult'], sets_lemmatization, labels['Insult'], 1, 0)

Accuracy score:  0.5221476510067115
F1 score:  0.5291005291005291


**2.Αφαίρεση των Stopwords**

In [11]:
def stop_words(comment):
    tokens = nltk.word_tokenize(comment)
    
    tokens = [word for word in tokens if not word in stopwords.words('english')]
    
    cln = " "
    cln = cln.join(tokens)
    return cln

In [12]:
train_stopwords = train['Comment'].apply(lambda x: stop_words(x))
sets_stopwords = sets['Comment'].apply(lambda x: stop_words(x))

**Prediction - Accuracy**

In [13]:
NaiveBayes_prediction(train_stopwords, train['Insult'], sets_stopwords, labels['Insult'], 1, 0)

Accuracy score:  0.5217002237136465
F1 score:  0.5284516982796648


**3.Χρήση Bigrams**

In [14]:
NaiveBayes_prediction(train['Comment'], train['Insult'], sets['Comment'], labels['Insult'], 2, 0)

Accuracy score:  0.5700223713646533
F1 score:  0.48361096184846863


**4.Χρήση Laplace Smoothing**

In [15]:
NaiveBayes_prediction(train['Comment'], train['Insult'], sets['Comment'], labels['Insult'], 1, 1)

Accuracy score:  0.680089485458613
F1 score:  0.6202867764206055


**POS Based Features**

In [16]:
from nltk import pos_tag, map_tag

def PosTag(comment):

    tokens = nltk.word_tokenize(comment)
    pos_tags = pos_tag(tokens)
    simplifiedTags = [(word, map_tag('en-ptb', 'universal', tag)) for word, tag in pos_tags]
    return simplifiedTags

In [17]:
pos_train = train['Comment'].apply(lambda x: PosTag(x))
pos_sets = sets['Comment'].apply(lambda x: PosTag(x))

In [18]:
from collections import Counter
# btb==1 for beat the benchmark, inorder to take into account the full tagging
def posFrequency(comment, btb):
    tokens = nltk.word_tokenize(comment)
    tags = nltk.pos_tag(tokens)
    simplifiedTags = [(word, map_tag('en-ptb', 'universal', tag)) for word, tag in tags]

    counts = Counter( tag for word,  tag in simplifiedTags)
    cnt = []
    simplecounts = {}

    if btb == 0 :
        simptags = ['NOUN', 'ADV', 'VERB', 'ADJ']
        for entry in list(counts):
            if entry not in simptags:
                counts.pop(entry)
    for entry in counts:
        l = counts[entry]
        counts.update({entry: counts[entry]/len(tokens) - l})
        cnt.append(entry)

    if btb == 0 :
        other_tags = set(simptags) - set(cnt)
        for i in other_tags:
            counts[i] = 0
    return counts

In [20]:
poscount_train = train['Comment'].apply(lambda x: posFrequency(x, 0))
poscount_sets = sets['Comment'].apply(lambda x: posFrequency(x, 0))

In [21]:
df_train = pd.DataFrame(poscount_train.values.tolist(), index=poscount_train.index)
df_sets = pd.DataFrame(poscount_sets.values.tolist(), index=poscount_sets.index)

In [22]:
freq_array_train = df_train.to_numpy()
freq_array_sets = df_sets.to_numpy()

**TF-IDF Based Features**

In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features = 1000, max_df=1.0, min_df = 2)
train_vec = vectorizer.fit_transform(train['Comment'])
tfidf_array_train = train_vec.toarray()

# --------------------------------------------------------------------------------

# vectorizer = TfidfVectorizer(max_features = 1000, max_df=1.0, min_df = 2)
test_vec = vectorizer.transform(sets['Comment'])
tfidf_array_test = test_vec.toarray()

**Final Array** - tfidf+pos

In [24]:
final_df = pd.DataFrame()
df_trainvec = pd.DataFrame(tfidf_array_train)

final_df_train = pd.concat([df_trainvec, df_train], axis=1)

# --------------------------------------------------------------------------------

final_df_test = pd.DataFrame()
df_testvec = pd.DataFrame(tfidf_array_test)

final_df_test = pd.concat([df_testvec, df_sets], axis=1)

**SVM**

In [25]:
from sklearn.svm import SVC

model = SVC(kernel='linear', probability=True)
model.fit(final_df_train, train['Insult'])

X_test = final_df_test

predictions = model.predict(X_test)

Y_test = labels['Insult']

print('Accuracy score: ', accuracy_score(Y_test, predictions))
print('F1 score: ', f1_score(Y_test, predictions))
# print('Recall score: ', recall_score(Y_test, predictions))

Accuracy score:  0.6782997762863535
F1 score:  0.5874928284566839


**Random Decision Tree**

In [26]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_estimators = 50, random_state = 42)
model.fit(final_df_train, train['Insult'] )

X_test = final_df_test

predictions = model.predict(X_test)

Y_test = labels['Insult']

print('Accuracy score: ', accuracy_score(Y_test, predictions))
print('F1 score: ', f1_score(Y_test, predictions))

# print('Precision score: ', precision_score(Y_test, predictions))
# print('Recall score: ', recall_score(Y_test, predictions))

Accuracy score:  0.6317673378076063
F1 score:  0.44802146210596916


**Beat the benchmark** -με SVM

In [27]:
# lemmatization
train_lemmatization = train['Comment'].apply(lambda x: lemmatization(x))
sets_lemmatization = sets['Comment'].apply(lambda x: lemmatization(x))

# remove stopwords
train_stopwords = train_lemmatization.apply(lambda x: stop_words(x))
sets_stopwords = sets_lemmatization.apply(lambda x: stop_words(x))

# full tagging
poscount_train = train_stopwords.apply(lambda x: posFrequency(x, 1))
poscount_sets = sets_stopwords.apply(lambda x: posFrequency(x, 1))

In [28]:
# tfidf with new cleaned data

vectorizer = TfidfVectorizer(max_features = 1000, max_df=1.0, min_df = 2)
train_vec = vectorizer.fit_transform(train_stopwords)
tfidf_array_train = train_vec.toarray()

# --------------------------------------------------------------------------------

# vectorizer = TfidfVectorizer(max_features = 1000, max_df=1.0, min_df = 2)
test_vec = vectorizer.transform(sets_stopwords)
tfidf_array_test = test_vec.toarray()

# create combined dataframe with cleaned data

final_df = pd.DataFrame()
df_trainvec = pd.DataFrame(tfidf_array_train)

final_df_train = pd.concat([df_trainvec, df_train], axis=1)

# --------------------------------------------------------------------------------

final_df_test = pd.DataFrame()
df_testvec = pd.DataFrame(tfidf_array_test)

final_df_test = pd.concat([df_testvec, df_sets], axis=1)

In [29]:
# # oversampling
# from imblearn.over_sampling import SMOTE

# sm = SMOTE(sampling_strategy=150/100000, random_state=42,)
# x_train_res, y_train_res = sm.fit_sample(final_df_train.to_numpy,  labels['Insult'])
# vgazei error

In [30]:
from sklearn.svm import SVC

model = SVC(kernel='linear', probability=True, class_weight={0:1, 1:2})
model.fit(final_df_train, train['Insult'])

X_test = final_df_test

predictions = model.predict(X_test)

Y_test = labels['Insult']

print('Accuracy score: ', accuracy_score(Y_test, predictions))
print('F1 score: ', f1_score(Y_test, predictions))

Accuracy score:  0.6850111856823267
F1 score:  0.643002028397566
