# Data mining project:

this projet consist of ...

## Importing tools:
First of all we import our libraries the we will use in our projet like in the cell below:

In [343]:
import nltk
import numpy as np
import pandas as pd
from sklearn.svm import LinearSVC,SVC
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split,cross_val_score
import collections
import xml.etree.ElementTree as ET
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
from os import listdir
from os.path import isfile, join
import shutil

### Download the required datasets:
To use NLTK (Natural Language ToolKit) we need to download the following datasets to use them for preprocessing (tokenization,stopwords,lemmatization...).

In [2]:
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ami\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ami\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ami\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Arranging the data folder:
we start by reading the CSV files to retrieve the files that hasn't a class yet to move them into a new folder so that the classification process will be good as the train.csv file. as a result we get **(4800 file that is in the folder data and 200 in test_set folder).** the function bellow is only executed on the first time.

## Cleaning process:
In this process firt of all we parse each XML file then we tokenize it so that it will be represented as a list in python then we lemmatize each word and then we do the stemming process to get the word stem, So the variable **cleaned_docs** will contains a list of document that we cleaned. 

In [440]:
################# docs that we are going to train on####################
df = pd.read_csv('train.csv',delimiter=",")
files=df['file']
Y=df['earnings: 0 no/ 1 yes']
cleaned_docs=[]
for f in files:
    stemmer = PorterStemmer()
    lemmatiser = WordNetLemmatizer()
    s=open("data/"+f,"r")
    dom = ET.parse(s)
    root = dom.getroot()
    doc_word=str(root.text)
    token = word_tokenize(doc_word)
    clean_tokens = []
    for word in token:
        clean_tokens.append(stemmer.stem(lemmatiser.lemmatize(word)))
    s = ' '.join(clean_tokens)
    cleaned_docs.append(s)

In [441]:
#################### docs to predict their classes######################
df = pd.read_csv('test.csv',delimiter=",")
files=df['file']
docs=[]
for f in files:
    stemmer = PorterStemmer()
    lemmatiser = WordNetLemmatizer()
    s=open("data/"+f,"r")
    dom = ET.parse(s)
    root = dom.getroot()
    doc_word=str(root.text)
    token = word_tokenize(doc_word)
    clean_tokens = []
    for word in token:
        clean_tokens.append(stemmer.stem(lemmatiser.lemmatize(word)))
    s = ' '.join(clean_tokens)
    docs.append(s)

## Features extraction (TF-IDF):
in order to get features from our text files we user this scikit function to tokenize and calculate TF-iDF of each word at the 
end the variables **X** and **docs** will contains 

a matrix **(docs-features)** of TF-IDF values such as **X** will be the our dataset with it's known classes and **docs** the dataset of docs that we will predict its classes.

**TFIDFVectorizer params:**
* **min_df=10 :**
* **max_df=0.9 :**
* **stop_words:**
* **ngram_range:**
* **lowercase:**
* **token_pattern:**


In [382]:
tfidf=TfidfVectorizer(stop_words=stopwords.words('english'),min_df=10,max_df=0.9,ngram_range=(1,2),lowercase=True,token_pattern=r'(?u)\b[A-Za-z]+\b'
)
tfidf.fit(cleaned_docs)
X=tfidf.transform(t).toarray()
docs=tfidf.transform(docs).toarray()

In [385]:
X.shape # we had 5398 features

(4800, 5398)

## Spliting training/test sets:
We split the our dataset into training set and test set and we used the default parameters so that we will have 75% (3600 docs)of training set and 25% (1200 docs) of test set.

In [387]:
X_train, X_test, y_train, y_test = train_test_split(X, Y) # split test set=> 25% and train set 75%

In [388]:
clf=LinearSVC()

clf.fit(X_train,y_train)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [389]:
clf.predict(docs)

array([0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1,
       1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0], dtype=int64)

In [390]:
a=clf.predict(docs)
collections.Counter(a)

Counter({0: 151, 1: 49})

In [391]:
clf.score(X_train,y_train)

0.9819444444444444

In [392]:
clf.score(X_test,y_test)

0.9733333333333334

In [393]:
scores = cross_val_score(clf, X, labels, cv=5)
scores

array([0.94166667, 0.97916667, 0.96770833, 0.96354167, 0.96145833])

In [394]:
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.96 (+/- 0.02)


In [395]:
p=clf.predict(X_test)

In [396]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(confusion_matrix(y_test,p))  
print(classification_report(y_test,p))  
print(accuracy_score(y_test, p))  

[[959   7]
 [ 25 209]]
             precision    recall  f1-score   support

          0       0.97      0.99      0.98       966
          1       0.97      0.89      0.93       234

avg / total       0.97      0.97      0.97      1200

0.9733333333333334


In [397]:
from sklearn.naive_bayes import GaussianNB
clf=GaussianNB()
clf.fit(X_train,y_train)


GaussianNB(priors=None)

In [399]:
clf.predict(docs)

array([0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0,
       1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1,
       0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1,
       1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0,
       0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1,
       0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 1], dtype=int64)

In [400]:
clf.score(X_train,y_train)

0.9138888888888889

In [401]:
clf.score(X_test,y_test)

0.8841666666666667

In [402]:
scores = cross_val_score(clf, X_train, y_train, cv=5)
scores

array([0.85852982, 0.87361111, 0.86805556, 0.85555556, 0.85952712])

In [403]:
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.86 (+/- 0.01)


In [404]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
p=clf.predict(X_test)
print(confusion_matrix(y_test,p))  
print(classification_report(y_test,p))  
print(accuracy_score(y_test, p))  

[[872  94]
 [ 45 189]]
             precision    recall  f1-score   support

          0       0.95      0.90      0.93       966
          1       0.67      0.81      0.73       234

avg / total       0.90      0.88      0.89      1200

0.8841666666666667


In [405]:
a=clf.predict(XX)
collections.Counter(a)

Counter({0: 135, 1: 65})

In [406]:
from sklearn.neighbors import KNeighborsClassifier

In [407]:
knn = KNeighborsClassifier(n_neighbors=5)

In [408]:
knn.fit(X,Y)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [409]:
knn.score(X,Y)

0.9664583333333333

In [410]:
knn.predict(docs)

array([0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1,
       0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1,
       0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0], dtype=int64)

In [411]:
a=knn.predict(docs)
collections.Counter(a)

Counter({0: 145, 1: 55})

In [412]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
p=knn.predict(X_test)
print(confusion_matrix(y_test,p))  
print(classification_report(y_test,p))  
print(accuracy_score(y_test, p))  

[[954  12]
 [ 20 214]]
             precision    recall  f1-score   support

          0       0.98      0.99      0.98       966
          1       0.95      0.91      0.93       234

avg / total       0.97      0.97      0.97      1200

0.9733333333333334


In [435]:
from sklearn import svm
clf = svm.SVC(kernel='linear',C=1.0)
clf.fit(X_train, y_train)  

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [436]:
clf.predict(docs)

array([0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1,
       1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0], dtype=int64)

In [437]:
p=clf.predict(X_test)
a=clf.predict(docs)
collections.Counter(a)

Counter({0: 150, 1: 50})

In [438]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
p=clf.predict(X_test)
print(confusion_matrix(y_test,p))  
print(classification_report(y_test,p))  
print(accuracy_score(y_test, p))  

[[928  12]
 [ 30 230]]
             precision    recall  f1-score   support

          0       0.97      0.99      0.98       940
          1       0.95      0.88      0.92       260

avg / total       0.96      0.96      0.96      1200

0.965


In [439]:
from sklearn import svm, datasets
from sklearn.model_selection import GridSearchCV
parameters = {'C':[1, 10]}
clf = GridSearchCV(clf, parameters, cv=5)

In [420]:
clf.fit(X_train,y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid=True, n_jobs=1, param_grid={'C': [1, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [421]:
clf.score(X_train,y_train)

0.9769444444444444

In [422]:
clf.score(X_test,y_test)

0.9758333333333333

In [423]:
clf.predict(docs)

array([0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1,
       1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0], dtype=int64)

In [424]:
clf.predict(X_test)
a=clf.predict(docs)
collections.Counter(a)

Counter({0: 151, 1: 49})

In [328]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
p=clf.predict(X_test)
print(confusion_matrix(y_test,p))  
print(classification_report(y_test,p))  
print(accuracy_score(y_test, p))  

[[942  10]
 [ 29 219]]
             precision    recall  f1-score   support

          0       0.97      0.99      0.98       952
          1       0.96      0.88      0.92       248

avg / total       0.97      0.97      0.97      1200

0.9675


In [434]:
from sklearn.model_selection import KFold
kf = KFold(n_splits=4)
kf.get_n_splits(X)

print(kf)  

for train_index, test_index in kf.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = Y[train_index], Y[test_index]
    print(clf.fit(X_train,y_train).score(X_test,y_test))

KFold(n_splits=4, random_state=None, shuffle=False)
TRAIN: [1200 1201 1202 ... 4797 4798 4799] TEST: [   0    1    2 ... 1197 1198 1199]
0.95
TRAIN: [   0    1    2 ... 4797 4798 4799] TEST: [1200 1201 1202 ... 2397 2398 2399]
0.9691666666666666
TRAIN: [   0    1    2 ... 4797 4798 4799] TEST: [2400 2401 2402 ... 3597 3598 3599]
0.9716666666666667
TRAIN: [   0    1    2 ... 3597 3598 3599] TEST: [3600 3601 3602 ... 4797 4798 4799]
0.965
