In [1]:
import calendar
import numpy as np
import pandas as pd
import pickle
import string
import re
import warnings
warnings.filterwarnings("ignore")

# machine learning
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier 
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression 
from sklearn.metrics import roc_curve, auc, confusion_matrix

# text mining
from unidecode import unidecode
import nltk
from nltk.stem.snowball import SnowballStemmer  # Snowball stemmer was chosen in favor of Porter Stemmer which is a bit more aggressive and tends to remove too much from a word
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from langdetect import detect
nltk.download("punkt")
nltk.download("stopwords")
STOPWORDS_FR = stopwords.words("french")

# data viz
from matplotlib import pyplot as plt

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sawal\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sawal\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Snowball stemmers, built once at notebook level so they are not re-created
# for every document. make_text_prep uses the French one.
STEMMER_FR = SnowballStemmer("french")
STEMMER_EN = SnowballStemmer("english")

# Exercice 1

In [4]:
def pipeline_for(file):
    """Parse an id/variable text file by looping once over its lines.

    Every non-empty line is read as ``key,value``. A line whose key is ``id``
    opens a new record; each following line, up to the next ``id`` line, is a
    variable of that record. Lines found before the first ``id`` line are
    ignored.

    Arguments:
      file {str}  : path of txt file.

    Returns:
      pd.DataFrame - Output for exercise 1. One row per variable line, holding
      the id of its record and the value (kept as text) in the column named
      after the variable. Columns are ``id`` followed by ``v1`` ... ``vN`` in
      numeric order, where N is the highest variable number found; variables
      that never occur are added as all-NaN columns. Any other column name is
      appended after them in alphabetical order. A file without any record
      yields an empty frame with the single column ``id``.
    """
    with open(file, 'r') as f:
        lines = [line.strip() for line in f]

    # Single pass: remember the record currently open and emit one sparse
    # row per variable line. Each variable line is attached to the id line
    # that precedes it, so repeated ids stay separate records.
    records = []
    current_id = None
    for line in lines:
        if not line:
            continue
        key, _, value = line.partition(",")
        if key == "id":
            current_id = value
        elif current_id is not None:
            records.append({"id": current_id, key: value})

    if not records:
        return pd.DataFrame(columns=["id"])

    df = pd.DataFrame(records)

    # Complete the v1..vN range from the numerically highest variable, so
    # that "v10" counts as larger than "v9" and is ordered after it.
    numbers = [
        int(match.group(1))
        for match in (re.fullmatch(r"v(\d+)", str(col)) for col in df.columns)
        if match
    ]
    v_cols = [f"v{i}" for i in range(1, max(numbers, default=0) + 1)]
    extra_cols = sorted(col for col in df.columns if col != "id" and col not in v_cols)

    # reindex both orders the columns and creates the missing ones as NaN.
    return df.reindex(columns=["id"] + v_cols + extra_cols)


In [1]:
def pipeline_noloop_lines(file, test=False):
    """Parse an id/variable text file WITHOUT looping over its lines.

    The file is loaded as a two-column (key, value) table and every step is a
    vectorized pandas / numpy operation. A row whose key is ``id`` opens a new
    record; the following rows, up to the next ``id`` row, are its variables.
    Rows located before the first ``id`` row are ignored.

    Arguments:
      file {str}  : path of txt file.
      test {bool} : when True the file is read with ``sep='\\n,'`` and the
                    python parsing engine instead of the default comma reader.
                    NOTE(review): the purpose of this alternate separator is
                    not visible from this function - confirm with the caller.

    Returns:
      pd.DataFrame - Output for exercise 1. One row per variable line, holding
      the id of its record and the value in the column named after the
      variable. Columns are ``id`` followed by ``v1`` ... ``vN`` in numeric
      order, where N is the highest variable number found; variables that
      never occur are added as all-NaN columns. Any other column name is
      appended after them in alphabetical order.
    """
    if test:
        df = pd.read_csv(file, header=None, sep='\n,', engine='python')
    else:
        df = pd.read_csv(file, header=None)

    df.columns = ["key", "value"]

    is_id = (df.key == "id").to_numpy()
    id_values = df.loc[is_id, "value"].to_numpy()
    # Number of the record each line belongs to (-1 before the first id line).
    record_no = np.cumsum(is_id) - 1
    is_variable = ~is_id & (record_no >= 0)

    # One column per key, one row per original line; keep only variable lines.
    df_final = df.pivot(columns="key", values="value").loc[is_variable]
    df_final = df_final.rename_axis(None, axis=1).reset_index(drop=True)
    # Give every variable line the id of the record that owns it.
    df_final["id"] = id_values[record_no[is_variable]]

    # Complete the v1..vN range from the numerically highest variable, so
    # that "v10" counts as larger than "v9" and is ordered after it.
    numbers = [
        int(match.group(1))
        for match in (re.fullmatch(r"v(\d+)", str(col)) for col in df_final.columns)
        if match
    ]
    v_cols = [f"v{i}" for i in range(1, max(numbers, default=0) + 1)]
    extra_cols = sorted(
        col for col in df_final.columns if col != "id" and col not in v_cols
    )

    # reindex both orders the columns and creates the missing ones as NaN.
    return df_final.reindex(columns=["id"] + v_cols + extra_cols)

# Exercice 2

In [3]:
# find the last day of month
def last_day_month(DATE):
    """Move a date to the last day of its own month.

    Arguments:
      DATE {Timestamp} : any date-like object exposing ``year``, ``month``
                         and ``replace(day=...)``.

    Returns:
      Timestamp - same object type as DATE, with only the day changed.
      --------------------------------
      Example : 2021-11-30
    """
    # monthrange returns (weekday of the 1st, number of days in the month).
    _, days_in_month = calendar.monthrange(DATE.year, DATE.month)
    return DATE.replace(day=days_in_month)

In [4]:
def baseline(df):
    """Return, for every id, the sequence of months covered by the data.

    The sequence runs from the month of the earliest ``date_debut`` to the
    month of the latest ``date_fin`` found in the whole frame, and is repeated
    for each distinct id (ids keep their order of first appearance).

    Arguments:
      df {pd.DataFrame} -- raw dataframe - parsed dates columns. Must contain
                           the columns ``id``, ``date_debut`` and ``date_fin``.

    Returns:
      pd.DataFrame with the columns ``id``, ``mois`` (first day of the month)
      and ``end`` (last day of the same month). The frame is empty when the
      date columns hold no value or when the latest ``date_fin`` falls before
      the earliest ``date_debut``.

        Example
        |id  |mois        |end         |
        |----|------------|------------|
        |1   |2017-11-01  |2017-11-30  |
        |1   |2017-12-01  |2017-12-31  |
        |1   |2018-01-01  |2018-01-31  |
        |1   |2018-02-01  |2018-02-28  |
        |2   |2017-11-01  |2017-11-30  |
        |2   |2017-12-01  |2017-12-31  |
        |2   |2018-01-01  |2018-01-31  |
        |2   |2018-02-01  |2018-02-28  |
    """
    first_day = df["date_debut"].min()
    last_day = df["date_fin"].max()
    if pd.isna(first_day) or pd.isna(last_day):
        return pd.DataFrame(columns=["id", "mois", "end"])

    # Bounds are snapped to midnight on the 1st of their month so the range
    # below is well defined even when the raw dates carry a time of day.
    low = pd.Timestamp(first_day).normalize().replace(day=1)
    up = pd.Timestamp(last_day).normalize().replace(day=1)
    # "MS" = month start: one timestamp per month, both bounds included.
    months = pd.date_range(low, up, freq="MS")

    # Cartesian product ids x months, ids varying slowest.
    data = pd.MultiIndex.from_product(
        [df["id"].unique(), months], names=["id", "mois"]
    ).to_frame(index=False)

    # MonthEnd(0) rolls a date forward to the end of its own month; for these
    # first-of-month timestamps it gives the same result as last_day_month.
    data["end"] = data["mois"] + pd.offsets.MonthEnd(0)
    return data

In [5]:
def dates_matching(df_baseline, df):


    """Compute NB_arret for each (id, month) row of the baseline.

    The baseline is left-joined with the raw table on ``id``, the joined
    ``date_debut`` / ``date_fin`` columns are adjusted in place by the
    successive rules below, one row is kept per (id, mois, end), and
    ``NB_arret`` is the number of days between the adjusted dates, both ends
    included (0 when the dates are missing).

    Arguments:
      df_baseline {pd.DataFrame} -- sequences of dates by id; the code reads
                                    its columns ``id``, ``mois`` and ``end``.
      df {pd.DataFrame} -- raw dataframe - parsed dates columns; the code reads
                           its columns ``id``, ``date_debut`` and ``date_fin``.

    Returns:
      pd.DataFrame -- the merged frame without the ``end`` column and with the
      extra integer column ``NB_arret``. Rows removed by the "whole month"
      rule below are not added back, so a given (id, mois) pair of the
      baseline may be absent from the result.

    """
    # One row per (baseline month, raw row) sharing the same id.
    df_merge = pd.merge(df_baseline, df, on="id", how='left')     
    # Rows where the month starts strictly before date_debut and date_fin lies
    # after the end of the month: date_fin is replaced by the month end.
    cond2 = (df_merge['mois'] < df_merge['date_debut']) & (df_merge['date_fin'] > df_merge['end'])
    df_merge.loc[cond2, 'date_fin'] =   df_merge['end']
    #df_merge.drop_duplicates(subset=None, keep="first", inplace=True)

    # Rows where date_fin and date_debut have different month numbers:
    # date_debut is replaced by the start of the baseline month.
    # NOTE(review): only .dt.month is compared, the year is not - confirm this
    # is intended for periods spanning a whole year.
    cond3 =  df_merge['date_fin'].dt.month - df_merge['date_debut'].dt.month !=0
    df_merge.loc[cond3, 'date_debut'] =   df_merge['mois']
    #df_merge.drop_duplicates(subset=None, keep="first", inplace=True)

    # Rows whose date_fin is before date_debut or before the month start:
    # both dates are set to missing.
    cond4 = (df_merge['date_fin'] < df_merge['date_debut']) |  (df_merge['date_fin'] < df_merge['mois'])
    df_merge.loc[cond4, ["date_debut", "date_fin"]] = None

    # Rows whose adjusted dates now equal exactly (month start, month end) are
    # removed. The original comment describes them as non-overlapping months.
    # NOTE(review): a period that really covers a whole month matches the same
    # test - confirm that dropping it is intended.
    cond5 = df_merge[(df_merge['date_debut'] == df_merge['mois']) &  (df_merge['date_fin'] == df_merge['end'])].index
    df_merge.drop(cond5 , inplace=True)

    # Sort ascending, then keep the last row of each (id, mois, end) group.
    # NOTE(review): sort_values places missing values last by default, so a
    # row whose dates were blanked above may be the one that is kept - confirm.
    df_merge = df_merge.sort_values(by=["id", "mois", "end", "date_debut", "date_fin"], ascending=True)\
                       .drop_duplicates(subset=["id","mois","end"], keep="last")

    # Inclusive day count between the adjusted dates; 0 when they are missing.
    df_merge['NB_arret'] =  ((df_merge['date_fin'] - df_merge['date_debut']).dt.days +1).fillna(0).astype(int)

    # NOTE(review): rename receives a bare mapping without `columns=` or
    # `axis=1`; DataFrame.rename then maps index labels, so the columns
    # date_debut / date_fin most likely keep their names - confirm the intent
    # before changing it, since callers may rely on the current names.
    df_merge = df_merge.reset_index(drop=True)\
                       .drop("end", axis=1)\
                       .rename({'date_debut':'date_debut_mois', 'date_fin':'date_fins_mois'})
    return df_merge

# Exercice 3

In [6]:
def make_text_prep(row,
                   word_blacklist,
                   regex_replace,
                   colonne: str = None):
    """
    Clean a text so that it can be vectorized.

    The input goes through the following steps, in this order:
        - Lowercasing
        - Punctuation replaced by spaces
        - Tokenization
        - Stop-word removal (French stop-words + ``word_blacklist``)
        - Stemming (French Snowball stemmer)
        - ASCII folding
        - Regex substitutions from ``regex_replace``

    Language detection is intentionally not performed: all the data are in
    French, so the French stemmer and stop-words are always used.

    Args:
        row : The input to be treated. Either the text itself (converted with
              ``str``) when ``colonne`` is None, or a row / mapping from which
              ``row[colonne]`` is read.
        word_blacklist (list[str]) : additional stop-words (any iterable of
              strings is accepted).
        regex_replace (Dict[str, str]) : regex pattern -> replacement, applied
              with ``re.sub`` on the ASCII-folded text, in the dict's order.
        colonne : name of the column to prepare. Default = None; if None, the
              input itself is the text.

    Returns:
        str : The treated version of the string, stripped of leading and
        trailing whitespace.
    """
    s = str(row) if colonne is None else row[colonne]

    stemmer = STEMMER_FR
    # A set gives constant-time membership tests in the token filter below.
    stop_words = set(word_blacklist) | set(STOPWORDS_FR)

    # Lowercase first: the stop-word lists are lowercase.
    s = s.lower()

    # Replace every punctuation character with a space so that words glued
    # by punctuation are split by the tokenizer.
    s_clean = s.translate(str.maketrans(string.punctuation, ' ' * len(string.punctuation)))

    s_tokens = word_tokenize(s_clean)

    # Stop-words are removed before stemming and before ASCII folding, i.e.
    # while tokens still carry their accents like the stop-word lists do.
    s_tokens_stemmed = [stemmer.stem(word) for word in s_tokens if word not in stop_words]
    s_ascii = unidecode(" ".join(s_tokens_stemmed))

    for regex, replace in regex_replace.items():
        s_ascii = re.sub(regex, replace, s_ascii)

    return s_ascii.strip()

In [7]:
 def experimenter(data, classifiers):
     """Fit several classifiers with their current parameters and print their AUC.

     No hyperparameter tuning is done. Every classifier is trained and scored
     on the same 80/20 split (fixed random_state) of the bag-of-words counts
     built from ``data.description_clean``.

     Arguments:
       data {pd.DataFrame} -- input dataframe; the columns ``description_clean``
                              (text) and ``target`` (label) are read.
       classifiers {List} -- estimator instances exposing ``fit`` and
                             ``predict_proba``.
                             Example : [LogisticRegression(), DecisionTreeClassifier()]
                             Each instance is fitted in place.

     Returns:
       None. For each classifier, prints the AUC on the training set and on
       the test set.
     """
     # roc_auc_score is not part of the notebook-level sklearn.metrics import,
     # so it is imported here to keep this cell runnable on its own.
     from sklearn.metrics import roc_auc_score

     classifiers = list(classifiers)
     if not classifiers:
         return

     # The split and the features do not depend on the classifier, so they are
     # computed once instead of once per model.
     X_train, X_test, y_train, y_test = train_test_split(
         data.description_clean, data.target, test_size=0.2, random_state=202209
     )

     # Vocabulary is learnt on the training texts only, then applied to the test texts.
     vectorizer = CountVectorizer(ngram_range=(1,1), min_df=3, max_df=0.9, strip_accents='unicode', analyzer="word")
     X_train = vectorizer.fit_transform(X_train)
     X_test = vectorizer.transform(X_test)

     for classifier in classifiers:
         # training
         model = classifier.fit(X_train, y_train)

         # score of the second column of predict_proba on both samples
         pred_proba_test = model.predict_proba(X_test)[:, 1]
         pred_proba_train = model.predict_proba(X_train)[:, 1]

         # metric
         print(f"----------classifier : {classifier} ----------")
         print("AUC TRAIN:", roc_auc_score(y_train, pred_proba_train))
         print("AUC TEST:", roc_auc_score(y_test, pred_proba_test))

In [8]:
def my_confusion_matrix(y_test, y_predicted):
    """
    Print accuracy and recall, then draw an annotated 2x2 confusion matrix.

    Args:
        y_test {pd.Series of int} : target /label
        y_predicted {pd.Series of int} : predicted label

    Returns:
        None. The figure is displayed with ``plt.show()``; each cell is
        annotated with its role (TN, FP, FN, TP) and its count.
    """
    cm = confusion_matrix(y_test, y_predicted)
    # For a 2x2 matrix, ravel() yields the cells row by row.
    tn, fp, fn, tp = cm.astype(int).ravel()

    total = tp + tn + fp + fn
    print("Accuracy:", round((tp + tn)/total,2))
    print("Recall:", round(tp /(tp+fn),2))

    plt.figure(figsize=(5,5))
    plt.clf()
    plt.imshow(cm, interpolation='nearest',cmap=plt.cm.Wistia)

    plt.title('Matrice de confusion')
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

    classNames = ['Negative','Positive']
    tick_marks = np.arange(len(classNames))
    plt.xticks(tick_marks, classNames, rotation=45)
    plt.yticks(tick_marks, classNames)

    # Rows are the true labels, columns the predicted ones.
    cell_names = (('TN', 'FP'), ('FN', 'TP'))
    for row_idx, row_names in enumerate(cell_names):
        for col_idx, cell_name in enumerate(row_names):
            plt.text(col_idx, row_idx, f"{cell_name} = {cm[row_idx][col_idx]}")
    plt.show()

In [9]:
def prediction_individuelle(input):
    """
    Predict the class of a single description.

    The decision threshold, the model and the vectorizer are read from the
    pickle files of ``./INTERMED`` at every call.

    Args:
        input (str) : The input string
    Returns:
        Boolean : False when the probability given by the second column of
        ``predict_proba`` is below the stored threshold, True otherwise.
    """
    # SECURITY: pickle.load can execute arbitrary code contained in the file;
    # only load artifacts that were produced by this project.
    artifact_paths = (
        "./INTERMED/optimal_thr.sav",
        "./INTERMED/random_forest_model.sav",
        "./INTERMED/vectorizer.sav",
    )
    artifacts = []
    for artifact_path in artifact_paths:
        with open(artifact_path, "rb") as f:
            artifacts.append(pickle.load(f))
    optimal_thr, model, vectorizer = artifacts

    # NOTE(review): the text is handed to the vectorizer as is, without going
    # through make_text_prep - confirm the stored vectorizer expects raw text.
    frame = pd.DataFrame([[input]], columns=['Description'])
    features = vectorizer.transform(frame['Description'])

    positive_proba = model.predict_proba(features)[:, 1]
    return not (positive_proba < optimal_thr)

In [10]:
def prediction_file(path="./INPUT/act_couv.csv", output_path="./OUTPUT/ct_couv_prediction.csv"):
    """
    Predict the class of every description of a csv file.

    The input is read with ``;`` as separator; its ``description`` column is
    passed row by row to ``prediction_individuelle`` and the result is stored
    in a new ``class_predite`` column.

    Args:
        path (str) : path of the input csv file
        output_path (str) : path of the csv file to write. Defaults to the
            location that was previously hard-coded.
    Returns:
        None. The completed table is written to ``output_path`` with the
        default ``DataFrame.to_csv`` options (comma separator, index written).
    """
    data = pd.read_csv(path, sep=";")
    data["class_predite"] = data.description.apply(prediction_individuelle)
    data.to_csv(output_path)