<a href="https://colab.research.google.com/github/stefarine/DMML2022_ROLEX/blob/main/with_doc2vec_0_63.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Importation

In [4]:
import pandas as pd
import numpy as np

df = pd.read_csv('training_data.csv')
df_pred = pd.read_csv('unlabelled_test_data.csv')

In [32]:
df_pred

Unnamed: 0,id,sentence
0,0,Nous dûmes nous excuser des propos que nous eû...
1,1,Vous ne pouvez pas savoir le plaisir que j'ai ...
2,2,"Et, paradoxalement, boire froid n'est pas la b..."
3,3,"Ce n'est pas étonnant, car c'est une saison my..."
4,4,"Le corps de Golo lui-même, d'une essence aussi..."
...,...,...
1195,1195,C'est un phénomène qui trouve une accélération...
1196,1196,Je vais parler au serveur et voir si on peut d...
1197,1197,Il n'était pas comme tant de gens qui par pare...
1198,1198,Ils deviennent dangereux pour notre économie.


In [5]:
df.head()

Unnamed: 0,id,sentence,difficulty
0,0,Les coûts kilométriques réels peuvent diverger...,C1
1,1,"Le bleu, c'est ma couleur préférée mais je n'a...",A1
2,2,Le test de niveau en français est sur le site ...,A1
3,3,Est-ce que ton mari est aussi de Boston?,A1
4,4,"Dans les écoles de commerce, dans les couloirs...",B1


In [6]:
df_pred.head()
X_pred = df_pred['sentence']
X_pred

0       Nous dûmes nous excuser des propos que nous eû...
1       Vous ne pouvez pas savoir le plaisir que j'ai ...
2       Et, paradoxalement, boire froid n'est pas la b...
3       Ce n'est pas étonnant, car c'est une saison my...
4       Le corps de Golo lui-même, d'une essence aussi...
                              ...                        
1195    C'est un phénomène qui trouve une accélération...
1196    Je vais parler au serveur et voir si on peut d...
1197    Il n'était pas comme tant de gens qui par pare...
1198        Ils deviennent dangereux pour notre économie.
1199    Son succès a généré beaucoup de réactions néga...
Name: sentence, Length: 1200, dtype: object

In [7]:
np.random.seed = 0

In [8]:
y = df['difficulty']
X = df['sentence']

# Baseline

In [None]:
base_rate = (df['difficulty'].value_counts().max()/df['difficulty'].shape[0]).round(4)
base_rate

0.1694

# Logistic Regression (whithout data cleaning)


In [29]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score


In [None]:
# Using default tokenizer in TfidfVectorizer
tfidf = TfidfVectorizer(ngram_range=(1, 1))


In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [None]:
# Define classifier
classifier = LogisticRegressionCV(solver='lbfgs', cv=5, max_iter=1000, random_state=0)


# Create pipeline
## The purpose of the pipeline is to assemble several steps that can be cross-validated together while setting different parameters.
pipe = Pipeline([('vectorizer', tfidf),
                 ('classifier', classifier)])

# Fit model on training set
pipe.fit(X_train, y_train)

KeyboardInterrupt: ignored

In [30]:
################################
#WHICH AVERAGE MODE IS BETTER ?#
################################

# Evaluate the model
def evaluate(true, pred):
    precision = precision_score(true, pred,average='macro')
    recall = recall_score(true, pred,average='macro')
    f1 = f1_score(true, pred,average='macro')
    print(f"CONFUSION MATRIX:\n{confusion_matrix(true, pred)}")
    print(f"ACCURACY SCORE:\n{accuracy_score(true, pred):.4f}")
    print(f"CLASSIFICATION REPORT:\n\tPrecision: {precision:.4f}\n\tRecall: {recall:.4f}\n\tF1_Score: {f1:.4f}")

In [None]:
# Predictions
y_pred = pipe.predict(X_test)

# Evaluation - test set
evaluate(y_test, y_pred)

CONFUSION MATRIX:
[[93 31 21 10  4  2]
 [54 60 30  6  6  8]
 [12 38 64 17  9 20]
 [ 6  6 15 66 27 24]
 [ 4  4 10 37 73 45]
 [ 7  8  8 19 24 92]]
ACCURACY SCORE:
0.4667
CLASSIFICATION REPORT:
	Precision: 0.4645
	Recall: 0.4677
	F1_Score: 0.4640


In [None]:
predictions = pipe.predict(X_pred)

predictions = pd.DataFrame(predictions,columns=['difficulty'])

predictions.to_csv("LogisticRegression.csv")

# KNN (whithout data cleaning)

In [None]:
# Import KNeighborsClassifier
from sklearn.neighbors import KNeighborsClassifier

# Define classifier
classifier_knn = KNeighborsClassifier()

# Create pipeline
pipe_knn = Pipeline([('vectorizer', tfidf),
                 ('classifier', classifier_knn)])

# Fit model on training set
pipe_knn.fit(X_train, y_train)

# Predictions
y_pred_knn = pipe_knn.predict(X_test)

evaluate(y_test, y_pred_knn)

CONFUSION MATRIX:
[[121  28   8   1   1   2]
 [ 98  51  12   1   1   1]
 [ 81  39  33   3   1   3]
 [ 49  30  19  29   3  14]
 [ 48  36  29  15  29  16]
 [ 37  29  17  23   9  43]]
ACCURACY SCORE:
0.3187
CLASSIFICATION REPORT:
	Precision: 0.4007
	Recall: 0.3183
	F1_Score: 0.3022


In [None]:
predictions_knn = pipe_knn.predict(X_pred)

predictions_knn = pd.DataFrame(predictions_knn,columns=['difficulty'])

predictions_knn.to_csv("KNN.csv")

## KNN improved

In [None]:
# Grid Search - hyperparameter tuning
from sklearn.model_selection import GridSearchCV

# Define parameters to test
grid = {'n_neighbors':np.arange(1,100),
        'p':np.arange(1,3),
        'weights':['uniform','distance']}

# Define and fit model
knn = KNeighborsClassifier()
classifier_knn_plus = GridSearchCV(knn, grid, cv=10)

pipe_knn_plus = Pipeline([('vectorizer', tfidf),
                 ('classifier', classifier_knn_plus)])

pipe_knn_plus.fit(X_train, y_train)

print("Hyperparameters:", classifier_knn_plus.best_params_)

Hyperparameters: {'n_neighbors': 4, 'p': 2, 'weights': 'distance'}


In [None]:
# Predictions
y_pred_knn_plus = pipe_knn_plus.predict(X_test)

evaluate(y_test, y_pred_knn_plus)


CONFUSION MATRIX:
[[116  27  14   1   1   2]
 [ 79  64  15   4   1   1]
 [ 65  38  44   8   2   3]
 [ 37  25  23  40   2  17]
 [ 37  30  23  26  36  21]
 [ 34  22  19  13  17  53]]
ACCURACY SCORE:
0.3677
CLASSIFICATION REPORT:
	Precision: 0.4227
	Recall: 0.3678
	F1_Score: 0.3575


In [None]:
predictions_knn_plus = pipe_knn_plus.predict(X_pred)

predictions_knn_plus = pd.DataFrame(predictions_knn_plus,columns=['difficulty'])

predictions_knn_plus.to_csv("Knn_plus.csv")

# Decision Tree Classifier (without data cleaning)

In [None]:
# Import Decision Tree Classifier
from sklearn.tree import DecisionTreeClassifier

# Define classifier
classifier_dtc = DecisionTreeClassifier()

# Create pipeline
pipe_dtc = Pipeline([('vectorizer', tfidf),
                 ('classifier', classifier_dtc)])

# Fit model on training set
pipe_dtc.fit(X_train, y_train)

# Predictions
y_pred_dtc = pipe_dtc.predict(X_test)

evaluate(y_test, y_pred_dtc)

CONFUSION MATRIX:
[[82 35 25  8  2  9]
 [44 54 39 15  4  8]
 [28 38 38 23 16 17]
 [ 7 20 27 40 28 22]
 [10 19 33 36 40 35]
 [12 12 28 38 32 36]]
ACCURACY SCORE:
0.3021
CLASSIFICATION REPORT:
	Precision: 0.3021
	Recall: 0.3022
	F1_Score: 0.2994


In [None]:
predictions_dtc = pipe_dtc.predict(X_pred)

predictions_dtc = pd.DataFrame(predictions_dtc,columns=['difficulty'])

predictions_dtc.to_csv("DecisionTreeClassifier.csv")

# Random Forest Classifier (without data cleaning)

In [None]:
# Use random forest
from sklearn.ensemble import RandomForestClassifier

# Define classifier
classifier_rfc = RandomForestClassifier()

# Create pipeline
pipe_rfc = Pipeline([('vectorizer', tfidf),
                 ('classifier', classifier_rfc)])

# Fit model on training set
pipe_rfc.fit(X_train, y_train)

# Predictions
y_pred_rfc = pipe_rfc.predict(X_test)

evaluate(y_test, y_pred_rfc)

CONFUSION MATRIX:
[[121  22  11   4   2   1]
 [ 82  51  18  11   2   0]
 [ 35  39  54  18   7   7]
 [ 18  11  13  65  24  13]
 [ 12  10  29  52  47  23]
 [ 15  10  17  32  23  61]]
ACCURACY SCORE:
0.4156
CLASSIFICATION REPORT:
	Precision: 0.4250
	Recall: 0.4182
	F1_Score: 0.4059


In [None]:
predictions_rfc = pipe_rfc.predict(X_pred)

predictions_rfc = pd.DataFrame(predictions_rfc,columns=['difficulty'])

predictions_rfc.to_csv("RandomForestClassifier.csv")

# Remove stopwords



In [10]:
# Install and update spaCy
!pip install -U spacy

# Download the French language model
!python -m spacy download fr

!python -m spacy download fr_core_news_sm

!pip install git+https://github.com/ClaudeCoulombe/FrenchLefffLemmatizer.git &> /dev/null


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
2022-12-18 00:35:18.320508: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
[38;5;3m⚠ As of spaCy v3.0, shortcuts like 'fr' are deprecated. Please use the
full pipeline package name 'fr_core_news_sm' instead.[0m
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting fr-core-news-sm==3.4.0
  Downloading https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-3.4.0/fr_core_news_sm-3.4.0-py3-none-any.whl (16.3 MB)
[K     |████████████████████████████████| 16.3 MB 1.5 MB/s 
Installing collected packages: fr-core-news-sm
Successfully installed fr-core-news-sm-3.4.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('fr_core_news_sm')
2022-12-18 00:35:31.778872: E tensorflow/stream_executo

In [11]:
# Import required packages
import spacy
from spacy import displacy
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import pandas as pd
import nltk
import string
from french_lefff_lemmatizer.french_lefff_lemmatizer import FrenchLefffLemmatizer

nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')

nlp = spacy.load('fr_core_news_sm')


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [35]:
#Convert to lowercase
df['sentence'] = df['sentence'].str.lower()
df_pred['sentence'] = df_pred['sentence'].str.lower()
# Define the function to remove the punctuation
def remove_punctuations(text):
    for punctuation in string.punctuation:
        text = text.replace(punctuation, " ")
    return text


# Remove punctuation
df['sentence'] = df['sentence'].apply(remove_punctuations)
df_pred['sentence'] = df_pred['sentence'].apply(remove_punctuations)
df_pred.head()

Unnamed: 0,id,sentence
0,0,nous dûmes nous excuser des propos que nous eû...
1,1,vous ne pouvez pas savoir le plaisir que j ai ...
2,2,et paradoxalement boire froid n est pas la b...
3,3,ce n est pas étonnant car c est une saison my...
4,4,le corps de golo lui même d une essence aussi...


In [13]:
#Define list of stopwords

french_stopwords = spacy.lang.fr.stop_words.STOP_WORDS
french_stopwords.add('a')
french_stopwords.add('c')
french_stopwords.add('d')
french_stopwords.add('e')
french_stopwords.add('j')
french_stopwords.add('l')
french_stopwords.add('m')
french_stopwords.add('n')
french_stopwords.add('s')
french_stopwords.add('t')
french_stopwords.add('y')
french_stopwords.add('qu')

In [37]:
#Remove stopwords
df['sentence'] = df['sentence'].apply(lambda x: ' '.join([word for word in x.split() if word not in (french_stopwords)]))
df_pred['sentence'] = df_pred['sentence'].apply(lambda x: ' '.join([word for word in x.split() if word not in (french_stopwords)]))
df_pred.head(10)

Unnamed: 0,id,sentence
0,0,dûmes excuser propos eûmes prononcés
1,1,pouvez savoir plaisir recevoir bonne nouvelle
2,2,paradoxalement boire froid bonne parade
3,3,étonnant saison mystérieuse
4,4,corps golo essence surnaturelle monture arrang...
5,5,jeta cri petit cri voulut dresser débattre rep...
6,6,madame monsieur fils léo arrive jours retard é...
7,7,trouvé repas midi
8,8,racine mal bel bien penser tendant manichéisme...
9,9,madame


In [15]:
#tokenize
nlp = spacy.load('fr_core_news_sm')

df['tocken_without_stopwords'] = [nlp(text) for text in df.sentence]
df.head()

Unnamed: 0,id,sentence,difficulty,tocken_without_stopwords
0,0,coûts kilométriques réels diverger sensiblemen...,C1,"(coûts, kilométriques, réels, diverger, sensib..."
1,1,bleu couleur préférée aime vert,A1,"(bleu, couleur, préférée, aime, vert)"
2,2,test niveau français site internet école,A1,"(test, niveau, français, site, internet, école)"
3,3,mari boston,A1,"(mari, boston)"
4,4,écoles commerce couloirs places financières ar...,B1,"(écoles, commerce, couloirs, places, financièr..."


In [16]:
#add a columns with the lengh 
df['len'] = [len(token) for token in df.tocken_without_stopwords]
df.head()

Unnamed: 0,id,sentence,difficulty,tocken_without_stopwords,len
0,0,coûts kilométriques réels diverger sensiblemen...,C1,"(coûts, kilométriques, réels, diverger, sensib...",21
1,1,bleu couleur préférée aime vert,A1,"(bleu, couleur, préférée, aime, vert)",5
2,2,test niveau français site internet école,A1,"(test, niveau, français, site, internet, école)",6
3,3,mari boston,A1,"(mari, boston)",2
4,4,écoles commerce couloirs places financières ar...,B1,"(écoles, commerce, couloirs, places, financièr...",19


#Stemming

In [17]:
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer(language='french')

def return_stem(sentence):
    doc = nlp(sentence)
    return [stemmer.stem(X.text) for X in doc]

In [39]:
df['stemmed'] = [return_stem(text) for text in df.sentence]
df_pred['stemmed'] = [return_stem(text) for text in df_pred.sentence]

In [19]:
df['label'] = pd.factorize(df['difficulty'])[0] + 1

In [None]:
df_pred.head()

In [20]:
df.head()

Unnamed: 0,id,sentence,difficulty,tocken_without_stopwords,len,stemmed,label
0,0,coûts kilométriques réels diverger sensiblemen...,C1,"(coûts, kilométriques, réels, diverger, sensib...",21,"[coût, kilometr, réel, diverg, sensibl, valeur...",1
1,1,bleu couleur préférée aime vert,A1,"(bleu, couleur, préférée, aime, vert)",5,"[bleu, couleur, préfer, aim, vert]",2
2,2,test niveau français site internet école,A1,"(test, niveau, français, site, internet, école)",6,"[test, niveau, franc, sit, internet, écol]",2
3,3,mari boston,A1,"(mari, boston)",2,"[mar, boston]",2
4,4,écoles commerce couloirs places financières ar...,B1,"(écoles, commerce, couloirs, places, financièr...",19,"[écol, commerc, couloir, plac, financi, arriv,...",3


#Word2vec


In [2]:
!pip install gensim
!pip install python-Levenshtein

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting python-Levenshtein
  Downloading python_Levenshtein-0.20.8-py3-none-any.whl (9.4 kB)
Collecting Levenshtein==0.20.8
  Downloading Levenshtein-0.20.8-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (174 kB)
[K     |████████████████████████████████| 174 kB 6.3 MB/s 
[?25hCollecting rapidfuzz<3.0.0,>=2.3.0
  Downloading rapidfuzz-2.13.6-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.2 MB)
[K     |████████████████████████████████| 2.2 MB 46.0 MB/s 
[?25hInstalling collected packages: rapidfuzz, Levenshtein, python-Levenshtein
Successfully installed Levenshtein-0.20.8 python-Levenshtein-0.20.8 rapidfuzz-2.13.6


In [3]:
import gensim
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import utils
import csv
from tqdm import tqdm
import multiprocessing
import nltk

In [64]:
documents = []
testdocuments = []
for ind in df.index:
  documents.append(TaggedDocument(df['stemmed'][ind],
      [df['label'][ind]]))

for ind in df_pred.index:
  testdocuments.append(TaggedDocument(df_pred['stemmed'][ind],
      tags=None))
testdocuments

[TaggedDocument(words=['dûm', 'excus', 'propos', 'eûm', 'prononc'], tags=None),
 TaggedDocument(words=['pouv', 'savoir', 'plais', 'recevoir', 'bon', 'nouvel'], tags=None),
 TaggedDocument(words=['paradoxal', 'boir', 'froid', 'bon', 'parad'], tags=None),
 TaggedDocument(words=['éton', 'saison', 'mystéri'], tags=None),
 TaggedDocument(words=['corp', 'golo', 'essenc', 'surnaturel', 'montur', 'arrang', 'obstacl', 'matériel', 'objet', 'gên', 'rencontr', 'pren', 'ossatur', 'rend', 'intérieur', 'fût', 'bouton', 'port', 'adapt', 'aussitôt', 'surnag', 'invincibl', 'rob', 'roug', 'figur', 'pâl', 'nobl', 'mélancol', 'laiss', 'paraîtr', 'aucun', 'troubl', 'transvertebr'], tags=None),
 TaggedDocument(words=['jet', 'cri', 'pet', 'cri', 'voulut', 'dress', 'débattr', 'repouss', 'ced', 'forc', 'eût', 'manqu', 'résist'], tags=None),
 TaggedDocument(words=['madam', 'monsieur', 'fil', 'léo', 'arriv', 'jour', 'retard', 'écol'], tags=None),
 TaggedDocument(words=['trouv', 'rep', 'mid'], tags=None),
 TaggedD

In [57]:
document_train, document_test = train_test_split(documents, test_size=0.2, random_state=0)

In [59]:
cores = multiprocessing.cpu_count()

model_dbow = Doc2Vec(dm=0, vector_size=300, negative=5, hs=0, min_count=2, sample = 0, workers=cores)
model_dbow.build_vocab([x for x in tqdm(document_train)])
document = utils.shuffle(document_train)
model_dbow.train(documents, total_examples=model_dbow.corpus_count, epochs=model_dbow.epochs)
def vector_for_learning(model, input_docs):
    sents = input_docs
    targets, feature_vectors = zip(*[(doc.tags[0], model.infer_vector(doc.words, steps=100)) for doc in sents])
    return targets, feature_vectors

def vector_for_pred(model, input_docs):
    sents = input_docs
    feature_vectors = [(model.infer_vector(doc.words, steps=100)) for doc in sents]
    return feature_vectors


100%|██████████| 3840/3840 [00:00<00:00, 1106266.05it/s]


In [60]:
y_train, X_train = vector_for_learning(model_dbow, document_train)
y_test, X_test = vector_for_learning(model_dbow, document_test)

In [61]:
# Fit model on training set
logregCV = LogisticRegressionCV(solver='lbfgs', cv=5, max_iter=1000, random_state=0)
logregCV.fit(X_train, y_train)

# Predictions
y_pred = logregCV.predict(X_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [62]:
evaluate(y_test, y_pred)

CONFUSION MATRIX:
[[ 98   7  11  10  23  24]
 [  3 122  11  23   1   1]
 [  7  23  88  19  14   9]
 [  4  47  17  90   4   2]
 [  8   5   9   8 103  11]
 [ 19   7   5   4  20 103]]
ACCURACY SCORE:
0.6292
CLASSIFICATION REPORT:
	Precision: 0.6338
	Recall: 0.6317
	F1_Score: 0.6284


In [65]:
testdocuments = vector_for_pred(model_dbow, testdocuments)
LogRegPredCV = logregCV.predict(testdocuments)

In [75]:
predictions = pd.DataFrame(columns=['difficulty'])
predictions['label'] = LogRegPredCV
predictions['difficulty'] = predictions['label'].replace([1,2,3,4,5,6],['A1','A2','B1','B2','C1','C2'])
predictions = predictions.drop(columns=['label'])
predictions

Unnamed: 0,difficulty
0,C2
1,B2
2,B1
3,C1
4,C2
...,...
1195,A1
1196,B2
1197,C2
1198,A1


In [76]:
predictions.to_csv("doc2vec.csv")

#Ensemble TEST

In [None]:
#https://scikit-learn.org/stable/modules/classes.html#module-sklearn.ensemble
#https://towardsdatascience.com/ensemble-methods-or-democracy-for-ai-bac2fa129f61

#https://github.com/crownpku/text2vec

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [None]:
# Define classifier
classifier = LogisticRegressionCV(solver='lbfgs', cv=5, max_iter=1000, random_state=0)


# Create pipeline
## The purpose of the pipeline is to assemble several steps that can be cross-validated together while setting different parameters.
pipe_LRCV1 = Pipeline([('vectorizer', tfidf),
                 ('classifier', classifier)])

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=43)

In [None]:
# Define classifier
classifier = LogisticRegressionCV(solver='lbfgs', cv=5, max_iter=1000, random_state=43)


# Create pipeline
## The purpose of the pipeline is to assemble several steps that can be cross-validated together while setting different parameters.
pipe_LRCV2 = Pipeline([('vectorizer', tfidf),
                 ('classifier', classifier)])

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=12)

In [None]:
# Define classifier
classifier = LogisticRegressionCV(solver='lbfgs', cv=5, max_iter=1000, random_state=12)


# Create pipeline
## The purpose of the pipeline is to assemble several steps that can be cross-validated together while setting different parameters.
pipe_LRCV3 = Pipeline([('vectorizer', tfidf),
                 ('classifier', classifier)])

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=33)

In [None]:
# Define classifier
classifier = LogisticRegressionCV(solver='lbfgs', cv=5, max_iter=1000, random_state=33)


# Create pipeline
## The purpose of the pipeline is to assemble several steps that can be cross-validated together while setting different parameters.
pipe_LRCV4 = Pipeline([('vectorizer', tfidf),
                 ('classifier', classifier)])

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=20)

In [None]:
# Define classifier
classifier = LogisticRegressionCV(solver='lbfgs', cv=5, max_iter=1000, random_state=20)


# Create pipeline
## The purpose of the pipeline is to assemble several steps that can be cross-validated together while setting different parameters.
pipe_LRCV5 = Pipeline([('vectorizer', tfidf),
                 ('classifier', classifier)])

In [None]:
from sklearn.ensemble import VotingClassifier

In [None]:
soft_voting_clf = VotingClassifier(estimators=[('LCRV1', pipe_LRCV1), ('LCRV2', pipe_LRCV2), ('LCRV3', pipe_LRCV3), ('LCRV4', pipe_LRCV4), ('LCRV5', pipe_LRCV5)], voting='soft')

In [None]:
soft_voting_clf.fit(X_train, y_train) # training

VotingClassifier(estimators=[('LCRV1',
                              Pipeline(steps=[('vectorizer', TfidfVectorizer()),
                                              ('classifier',
                                               LogisticRegressionCV(cv=5,
                                                                    max_iter=1000,
                                                                    random_state=0))])),
                             ('LCRV2',
                              Pipeline(steps=[('vectorizer', TfidfVectorizer()),
                                              ('classifier',
                                               LogisticRegressionCV(cv=5,
                                                                    max_iter=1000,
                                                                    random_state=43))])),
                             ('LCRV3',
                              Pipeline(steps=[('vectorizer', TfidfVectorizer()),
                         

In [None]:
predictions = soft_voting_clf.predict(X_pred)

predictions = pd.DataFrame(predictions,columns=['difficulty'])

predictions.to_csv("ensembleLRCV5x5v2.csv")

#LinearSVC

In [None]:
from sklearn.svm import LinearSVC

In [None]:
# Define classifier
classifier = LinearSVC()

In [None]:
# Create pipeline
## The purpose of the pipeline is to assemble several steps that can be cross-validated together while setting different parameters.
pipe = Pipeline([('vectorizer', tfidf),
                 ('classifier', classifier)])

# Fit model on training set
pipe.fit(X, y)

Pipeline(steps=[('vectorizer', TfidfVectorizer()), ('classifier', LinearSVC())])

In [None]:
predictions = pipe.predict(X_pred)

predictions = pd.DataFrame(predictions,columns=['difficulty'])

predictions.to_csv("LinearSVC.csv")

#MultinomialNB

In [None]:
from sklearn.naive_bayes import MultinomialNB

classifier = MultinomialNB()

In [None]:
# Create pipeline
## The purpose of the pipeline is to assemble several steps that can be cross-validated together while setting different parameters.
pipe = Pipeline([('vectorizer', tfidf),
                 ('classifier', classifier)])

# Fit model on training set
pipe.fit(X, y)

Pipeline(steps=[('vectorizer', TfidfVectorizer()),
                ('classifier', MultinomialNB())])

In [None]:
predictions = pipe.predict(X_pred)

predictions = pd.DataFrame(predictions,columns=['difficulty'])

predictions.to_csv("MultinomialNB.csv")

#LRCV5 + LinearSVC + MultinomialNB

In [None]:
# Define classifier
classifier = LogisticRegressionCV(solver='lbfgs', cv=5, max_iter=1000, random_state=0)


# Create pipeline
## The purpose of the pipeline is to assemble several steps that can be cross-validated together while setting different parameters.
pipe_LRCV = Pipeline([('vectorizer', tfidf),
                 ('classifier', classifier)])

In [None]:
from sklearn.svm import SVC
from nltk.classify.scikitlearn import SklearnClassifier


In [None]:
classifier2 = SklearnClassifier(SVC(kernel='linear',probability=True))

# Create pipeline
## The purpose of the pipeline is to assemble several steps that can be cross-validated together while setting different parameters.
pipe_LinearSVC = Pipeline([('vectorizer', tfidf),
                 ('classifier', classifier)])

In [None]:
# Define classifier
classifier3 = MultinomialNB()

# Create pipeline
## The purpose of the pipeline is to assemble several steps that can be cross-validated together while setting different parameters.
pipe_MultinomialNB = Pipeline([('vectorizer', tfidf),
                 ('classifier', classifier)])

In [None]:
from sklearn.ensemble import VotingClassifier

In [None]:
soft_voting_clf = VotingClassifier(estimators=[ ('LRCV', classifier),('MultinomialNB', classifier3)], voting='soft')

In [None]:
X_t = tfidf.fit_transform(X)
X_t = pd.DataFrame(X_t.todense(),
    columns=tfidf.get_feature_names())



In [None]:
soft_voting_clf.fit(X_t, y) # training

In [None]:
X_pred = tfidf.transform(X_pred)
X_pred = pd.DataFrame(X_pred.todense(),
    columns=tfidf.get_feature_names())

In [None]:
predictions = soft_voting_clf.predict(X_pred)

predictions = pd.DataFrame(predictions,columns=['difficulty'])

predictions.to_csv("combi_LinearSVC_MultinomialNB.csv")

AttributeError: ignored