In [38]:
import sys
sys.path.append("../src")
import pandas as pd
import numpy as np

from preprocess import Preprocessing

import nltk
from nltk.corpus import stopwords


from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, accuracy_score
from spacy.lang.fr.stop_words import STOP_WORDS as fr_stop
from sklearn.model_selection import train_test_split

# Import datas

In [39]:
train_df = pd.read_csv("../datas/train_data.csv")

# Only keep the sex column

In [40]:
train = train_df.drop(["date_accident", "date_consolidation", "ID", "filename"], axis=1)

# Clean datas

In [41]:
def remove_newlines(df):
    df = df.replace("\n", '', regex=True)
    return df

In [42]:
train = remove_newlines(train)
preprocess = Preprocessing(train)

train = preprocess.remove_stopwords()
train = preprocess.remove_newlines()

train.head()

Unnamed: 0,texte,sexe
0,Le : 12/11/2019 Cour d’appel d’Agen chambre so...,homme
1,Le : 12/11/2019 Cour d’appel d’Agen chambre ci...,homme
2,Le : 12/11/2019 Cour d’appel d’Agen Audience p...,femme
3,Le : 12/11/2019 Cour d’appel d’Agen Audience p...,femme
4,Le : 12/11/2019 Cour d’appel d’Agen Audience p...,homme


In [43]:
train_df.head()

Unnamed: 0,ID,filename,texte,sexe,date_accident,date_consolidation
0,0,Agen_100515.txt,Le : 12/11/2019\n \n \nCour d’appel d’Agen \n ...,homme,1991-04-09,n.c.
1,1,Agen_1100752.txt,Le : 12/11/2019\n \n \nCour d’appel d’Agen \n ...,homme,2005-06-10,2010-01-19
2,2,Agen_1613.txt,Le : 12/11/2019\n \n \nCour d’appel d’Agen \n ...,femme,1997-09-26,n.c.
3,3,Agen_2118.txt,Le : 12/11/2019\n \n \nCour d’appel d’Agen \n ...,femme,1982-08-07,1982-11-07
4,4,Agen_21229.txt,Le : 12/11/2019\n \n \nCour d’appel d’Agen \n ...,homme,1996-11-26,n.c.


# Let's predict the sex of the victim!

## We will use a TF-IDF

In [44]:
vect = TfidfVectorizer(
  max_features=5000,
  min_df=20,
  stop_words=list(fr_stop),
  ngram_range=(2, 2),
  binary=True)

X = vect.fit_transform(train['texte'])
train["sexe"] = train["sexe"].replace({'homme': 0,
                                       "femme": 1,
                                       "n.c.": -1})
y = train['sexe']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

clf = LogisticRegression(multi_class='multinomial', random_state=26).fit(X, y)

preds = clf.predict((X_test))

  train["sexe"] = train["sexe"].replace({'homme': 0,


# Now, we compute our metrics

In [45]:
print("accuracy:", round(accuracy_score(y_test, preds), 2) * 100, "%")
print("f1:", round(f1_score(y_test, preds, average='macro'), 2) * 100, "%")

accuracy: 92.0 %
f1: 89.0 %
