In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
import pandas as pd
from sklearn.multiclass import OneVsRestClassifier
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Train one TF-IDF + logistic-regression model per label column, report its
# held-out accuracy, and keep its test-set predictions.

# Input locations and shared settings (paths are the original absolute paths,
# hoisted here so they only need to be changed in one place).
EXPERIENCES_CSV = '/Users/camille/repo/Hetic/projet_gouv/scraping/export-experiences_lem.csv'
POSITIVE_REVIEWS_CSV = '/Users/camille/repo/Hetic/projet_gouv/scraping/Cleaning_eda/all_data_pos.csv'
TEXT_COLUMN = 'Description'
SEED = 42

# Read the labelled data (tab-separated, latin-1 encoded).
new_df = pd.read_csv(EXPERIENCES_CSV, sep='\t', encoding='latin-1')
# 'Unnamed: 0' is presumably a leftover saved index column -- TODO confirm.
new_df = new_df.drop(columns=['Unnamed: 0'])

# Split the data into train (70%) and test (30%) sets.
train, test = train_test_split(new_df, random_state=SEED, test_size=0.30, shuffle=True)

# Raw text used as model input.
train_text = train[TEXT_COLUMN]
test_text = test[TEXT_COLUMN]

# Word-level TF-IDF over 1- to 3-grams with accents stripped.
vectorizer = TfidfVectorizer(strip_accents='unicode', analyzer='word', ngram_range=(1,3), norm='l2')

# Learn the vocabulary on the training text only, then reuse it for the test text.
x_train = vectorizer.fit_transform(train_text)
x_test = vectorizer.transform(test_text)

# Label matrices (every column except the text).
y_train = train.drop(labels=[TEXT_COLUMN], axis=1)
y_test = test.drop(labels=[TEXT_COLUMN], axis=1)

# Label columns are selected by name rather than by position, so the list stays
# correct even if the text column is not the last column of the file.
categories = [column for column in new_df.columns if column != TEXT_COLUMN]

# Pipeline applying logistic regression through a one-vs-rest classifier.
# The 'sag' solver shuffles the data, so a fixed random_state is required for
# the fitted models (and the reported accuracies) to be reproducible.
LogReg_pipeline = Pipeline([
    ('clf', OneVsRestClassifier(LogisticRegression(solver='sag', random_state=SEED), n_jobs=-1)),
])

# One column of test-set predictions per category, aligned on the test index.
predicted_df = pd.DataFrame(index=test.index)

# Fit each category exactly once: the predictions that are stored are the very
# ones the printed accuracy was computed from.
for category in categories:
    print('Processing {} comments...'.format(category))

    # Train the model for this category on the training split.
    LogReg_pipeline.fit(x_train, train[category])

    # Score the held-out split.
    prediction = LogReg_pipeline.predict(x_test)
    print('Test accuracy is {}'.format(accuracy_score(test[category], prediction)))
    print("\n")

    # Keep the predictions next to the true labels.
    predicted_df[category + '_predicted'] = prediction

# Test rows (text + true labels) side by side with the predicted labels.
# `test` already carries the text column, so no extra merge with new_df is
# needed; merging it again would duplicate it as Description_x / Description_y.
result_df = pd.concat([test, predicted_df], axis=1)

# Apply to the positive DataFrame: load the positive reviews.
df_pos = pd.read_csv(POSITIVE_REVIEWS_CSV)

Processing pos_info comments...
Test accuracy is 0.9135451240714398


Processing neg_info comments...
Test accuracy is 0.7477477477477478


Processing pos_access comments...
Test accuracy is 0.8632843369685474


Processing neg_access comments...
Test accuracy is 0.77161371898214


Processing pos_relation comments...
Test accuracy is 0.9231863442389758


Processing neg_relation comments...
Test accuracy is 0.93266951161688


Processing pos_reactivite comments...
Test accuracy is 0.9353564090406196


Processing neg_reactivite comments...
Test accuracy is 0.8392603129445235


Processing pos_simplicite comments...
Test accuracy is 0.9151256519677572


Processing neg_simplicite comments...
Test accuracy is 0.6936936936936937




In [3]:
import pandas as pd
# Load the positive-review dataset from its CSV export.
df_pos = pd.read_csv(
    filepath_or_buffer='/Users/camille/repo/Hetic/projet_gouv/scraping/Cleaning_eda/all_data_pos.csv',
)


In [4]:
# Display the positive-review frame (bare last expression -> rich notebook output).
df_pos

Unnamed: 0,rate,review_text,object_address,cleaned_text,sentiment,pos_reviews_terms
0,4,Toujours très bien reçu surtout celle avec les...,"5 Rue de Quimper, 68100 Mulhouse",bien cheveux violet,1,[]
1,5,"Bonjour, En bref, le personnel fait son travai...","5 Rue de Quimper, 68100 Mulhouse",bonjour bref personnel travail question soluti...,1,['problèm']
2,4,"Tout le monde est top (agents comme public), l...","5 Rue de Quimper, 68100 Mulhouse",monde agent public ascenseur,1,"['agent', 'public', 'ascenseur']"
3,5,Nouveau bâtiment avec de la place sur le parki...,"5 Rue de Quimper, 68100 Mulhouse",bâtiment place parking min arret,1,"['place', 'parking']"
4,5,Ma Future 2eme maison,"5 Rue de Quimper, 68100 Mulhouse",future 2eme maison,1,[]
...,...,...,...,...,...,...
3649,5,Je tiens à remercier Mme Morand Sophie qui a é...,"45 Av. Billaud Varenne, 17000 La Rochelle",mme morand sophie interlocuteur 16/11/22 jour ...,1,"['interlocuteur', 'rendez-vous', 'prise', 'cha..."
3650,5,"Faut essaie de remonter les étoile un peux, Ok...","45 Av. Billaud Varenne, 17000 La Rochelle",étoile chose mieux partie conseiller bien écou...,1,"['chose', 'bon', 'humain']"
3651,5,En vu des commentaire je suis obligée de laiss...,"45 Av. Billaud Varenne, 17000 La Rochelle",commentaire avis couseilliere mme.bou écoute r...,1,"['rapide', 'changement', 'tomber', 'pole', 'em..."
3652,3,j'ai eu soixante ans le 15 mars ; suis je obli...,"45 Av. Billaud Varenne, 17000 La Rochelle",soixant mars pointé,1,['pointer']


In [15]:
# Load the positive reviews, keeping only the rows that have cleaned text to vectorize.
df_pos = pd.read_csv(
    '/Users/camille/repo/Hetic/projet_gouv/scraping/Cleaning_eda/all_data_pos.csv'
).dropna(subset=['cleaned_text'])
pos_text = df_pos['cleaned_text']

# Project the cleaned text with the vectorizer that was fitted earlier in the notebook.
x_pos = vectorizer.transform(pos_text)

# Refit the shared pipeline for each category in turn and collect its
# predictions on the positive reviews, keyed by the output column name.
pos_predictions_by_column = {}
for category in categories:
    LogReg_pipeline.fit(x_train, train[category])
    predictions_pos = LogReg_pipeline.predict(x_pos)
    pos_predictions_by_column[category + '_predicted'] = predictions_pos

# One predicted column per category, aligned on the positive-review index.
predicted_pos_df = pd.DataFrame(pos_predictions_by_column, index=df_pos.index)

# Original review columns followed by the predicted label columns.
result_pos_df = pd.concat([df_pos, predicted_pos_df], axis=1)


In [17]:
# Keep only the reviews predicted as positive for the "info" label.
# The comparison yields a boolean mask; it must be used to index the frame
# (wrapping it in a list would just store the mask, not the matching rows).
jel = result_pos_df[result_pos_df['pos_info_predicted'] == 1]

In [18]:
# Inspect the selection built from the `pos_info_predicted == 1` condition.
jel

Unnamed: 0,rate,review_text,object_address,cleaned_text,sentiment,pos_reviews_terms,pos_info_predicted,neg_info_predicted,pos_access_predicted,neg_access_predicted,pos_relation_predicted,neg_relation_predicted,pos_reactivite_predicted,neg_reactivite_predicted,pos_simplicite_predicted,neg_simplicite_predicted
14,5,Très bon accueil. Très bon service.👍. Ma conse...,"11 Rue Pelée, 75011 Paris",bon accueil bon service. conseiller écoute que...,1,"['bon', 'bon', 'écout', 'précieux', 'conseil',...",1,0,1,0,1,0,0,0,0,0
47,5,"Très bon accueil, bon conseil, et efficace.","121 Bd de la Liberté, 59000 Lille",bon accueil bon conseil efficace,1,"['bon', 'bon', 'efficace']",1,0,1,0,1,0,0,0,0,0
74,3,j'ai toujours été très bien reçu,"30 Rue de Riegelsberg, 27140 Gisors",bien,1,[],1,0,1,0,1,0,0,0,1,0
82,3,Bien reçu et acceuillant,"29 Av. du Général de Gaulle, 52200 Langres",bien,1,[],1,0,1,0,1,0,0,0,1,0
91,5,Bon accueil et très bien renseigné merci,"40 Imp. des Lilas, 64340 Boucau",bon accueil bien,1,['bon'],1,0,1,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3595,4,Toujours était bien reçu,"Av. de la République, 59113 Seclin",bien,1,[],1,0,1,0,1,0,0,0,1,0
3598,4,Bonne accueil,"5 Rue des Dinandiers, 57070 Metz",bon accueil,1,['bon'],1,0,1,0,1,0,0,0,0,0
3615,4,Très bien situé. Juste la façade à améliorer,"11 Prom. du Belvédère, 77200 Torcy",bien façade,1,[],1,0,1,0,1,0,0,0,1,0
3637,5,Très bon accueil a l'agence Pôle Emploi de Ci...,"13 Rue Norbert Portejoie, 86400 Civray",bon accueil agence pôle civray bienveillance é...,1,"['bon', 'accueil', 'écout', 'réel', 'orientati...",1,0,1,0,1,0,0,0,0,0


In [34]:
# Load the previously exported prediction table from disk.
df = pd.read_csv(
    filepath_or_buffer='/Users/camille/repo/Hetic/projet_gouv/scraping/predict_label/result_pos_df.csv',
)

In [35]:
# Display the frame read from result_pos_df.csv.
df

Unnamed: 0,rate,review_text,object_address,cleaned_text,sentiment,pos_info_predicted,neg_info_predicted,neutre_info_predicted,unknown_info_predicted,pos_access_predicted,...,neutre_relation_predicted,unknown_relation_predicted,pos_reactivite_predicted,neg_reactivite_predicted,neutre_reactivite_predicted,unknown_reactivite_predicted,pos_simplicite_predicted,neg_simplicite_predicted,neutre_simplicite_predicted,unknown_simplicite_predicted
0,1,Pas de formation possible Pas de financement p...,"5 Rue de Quimper, 68100 Mulhouse",formation financement refus refus parcours inu...,0,0,0,0,1,0,...,0,1,0,0,0,1,0,0,0,1
1,1,"Aucun respect, j’explique pourtant bien mon so...","5 Rue de Quimper, 68100 Mulhouse",aucun respect pourtant bien souci faire rond p...,0,0,0,0,1,0,...,0,1,0,0,0,1,0,0,0,0
2,1,"Les fonctionnaires incompétents, arrogants et ...","5 Rue de Quimper, 68100 Mulhouse",fonctionnaire incompétent arrogant prétentieux...,0,0,0,0,1,0,...,0,1,0,0,0,1,0,0,0,1
3,1,"Aucun respect, j’explique pourtant bien mon so...","5 Rue de Quimper, 68100 Mulhouse",aucun respect pourtant bien souci air hautain ...,0,0,0,0,1,0,...,0,1,0,0,0,1,0,0,0,1
4,1,Si je pouvais ne pas mettre d'étoiles je n'en ...,"5 Rue de Quimper, 68100 Mulhouse",étoile mettrai conseiller indemnisation désagr...,0,0,0,0,0,0,...,0,1,0,0,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9357,3,j'ai eu soixante ans le 15 mars ; suis je obli...,"45 Av. Billaud Varenne, 17000 La Rochelle",soixant mars pointé,1,0,0,0,1,0,...,0,1,0,0,0,1,0,0,0,1
9358,3,Un pôle emploi comme un autre,"45 Av. Billaud Varenne, 17000 La Rochelle",pôle,1,0,0,0,1,0,...,0,1,0,0,0,1,0,0,0,1
9359,1,Incompétents.,"45 Av. Billaud Varenne, 17000 La Rochelle",incompétent,0,0,0,0,1,0,...,0,1,0,0,0,1,0,0,0,1
9360,1,Sourires en option,"45 Av. Billaud Varenne, 17000 La Rochelle",sourire option,0,0,0,0,1,0,...,0,1,0,0,0,1,0,0,0,1


In [22]:
# Percentage of rows of result_pos_df flagged by the two "info" predictions
# (column sum divided by the number of rows, times 100).
percentage_pos = result_pos_df['pos_info_predicted'].sum() / len(result_pos_df['pos_info_predicted'])*100
percentage_neg_info = result_pos_df['neg_info_predicted'].sum() / len(result_pos_df['neg_info_predicted'])*100
# The original name was misspelled; kept as an alias so existing references still work.
percentage_neg_ingo = percentage_neg_info

In [30]:
# List the columns of result_pos_df: the review columns plus one
# `<category>_predicted` column per label.
result_pos_df.columns

Index(['rate', 'review_text', 'object_address', 'cleaned_text', 'sentiment',
       'pos_reviews_terms', 'pos_info_predicted', 'neg_info_predicted',
       'pos_access_predicted', 'neg_access_predicted',
       'pos_relation_predicted', 'neg_relation_predicted',
       'pos_reactivite_predicted', 'neg_reactivite_predicted',
       'pos_simplicite_predicted', 'neg_simplicite_predicted'],
      dtype='object')

In [31]:
# Percentage of rows of result_pos_df flagged by each predicted label column
# (column sum divided by the number of rows, times 100).
percentage_neg_info_predicted = result_pos_df['neg_info_predicted'].sum() / len(result_pos_df['neg_info_predicted'])*100
percentage_pos_info_predicted = result_pos_df['pos_info_predicted'].sum() / len(result_pos_df['pos_info_predicted'])*100
percentage_pos_access_predicted = result_pos_df['pos_access_predicted'].sum() / len(result_pos_df['pos_access_predicted'])*100
percentage_neg_access_predicted = result_pos_df['neg_access_predicted'].sum() / len(result_pos_df['neg_access_predicted'])*100
percentage_pos_relation_predicted = result_pos_df['pos_relation_predicted'].sum() / len(result_pos_df['pos_relation_predicted'])*100
# The original name was misspelled, which left percentage_pos_relation_predicted
# undefined; the old spelling is kept as an alias for any existing reference.
perceentage_pos_relation_predicted = percentage_pos_relation_predicted
percentage_neg_relation_predicted = result_pos_df['neg_relation_predicted'].sum() / len(result_pos_df['neg_relation_predicted'])*100
percentage_pos_reactivite_predicted = result_pos_df['pos_reactivite_predicted'].sum() / len(result_pos_df['pos_reactivite_predicted'])*100
percentage_neg_reactivite_predicted = result_pos_df['neg_reactivite_predicted'].sum() / len(result_pos_df['neg_reactivite_predicted'])*100
percentage_pos_simplicite_predicted = result_pos_df['pos_simplicite_predicted'].sum() / len(result_pos_df['pos_simplicite_predicted'])*100
percentage_neg_simplicite_predicted = result_pos_df['neg_simplicite_predicted'].sum() / len(result_pos_df['neg_simplicite_predicted'])*100

In [33]:
def _percent_of_rows(column_name):
    """Return the sum of one column of ``result_pos_df`` over its row count, times 100."""
    column = result_pos_df[column_name]
    return column.sum() / len(column) * 100


# One percentage per predicted label column.
percentage_neg_info_predicted = _percent_of_rows('neg_info_predicted')
percentage_pos_info_predicted = _percent_of_rows('pos_info_predicted')
percentage_pos_access_predicted = _percent_of_rows('pos_access_predicted')
percentage_neg_access_predicted = _percent_of_rows('neg_access_predicted')
percentage_pos_relation_predicted = _percent_of_rows('pos_relation_predicted')
percentage_neg_relation_predicted = _percent_of_rows('neg_relation_predicted')
percentage_pos_reactivite_predicted = _percent_of_rows('pos_reactivite_predicted')
percentage_neg_reactivite_predicted = _percent_of_rows('neg_reactivite_predicted')
percentage_pos_simplicite_predicted = _percent_of_rows('pos_simplicite_predicted')
percentage_neg_simplicite_predicted = _percent_of_rows('neg_simplicite_predicted')

# Report the values in the same order as they were computed.
for label, value in [
    ('Percentage neg_info_predicted:', percentage_neg_info_predicted),
    ('Percentage pos_info_predicted:', percentage_pos_info_predicted),
    ('Percentage pos_access_predicted:', percentage_pos_access_predicted),
    ('Percentage neg_access_predicted:', percentage_neg_access_predicted),
    ('Percentage pos_relation_predicted:', percentage_pos_relation_predicted),
    ('Percentage neg_relation_predicted:', percentage_neg_relation_predicted),
    ('Percentage pos_reactivite_predicted:', percentage_pos_reactivite_predicted),
    ('Percentage neg_reactivite_predicted:', percentage_neg_reactivite_predicted),
    ('Percentage pos_simplicite_predicted:', percentage_pos_simplicite_predicted),
    ('Percentage neg_simplicite_predicted:', percentage_neg_simplicite_predicted),
]:
    print(label, value)


Percentage neg_info_predicted: 0.11220196353436186
Percentage pos_info_predicted: 6.339410939691445
Percentage pos_access_predicted: 16.016830294530155
Percentage neg_access_predicted: 14.866760168302944
Percentage pos_relation_predicted: 19.85974754558205
Percentage neg_relation_predicted: 0.0
Percentage pos_reactivite_predicted: 1.514726507713885
Percentage neg_reactivite_predicted: 0.42075736325385693
Percentage pos_simplicite_predicted: 4.179523141654979
Percentage neg_simplicite_predicted: 1.2903225806451613
