### Import des modules 

In [336]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier

### Chargement des données

In [338]:
# Folder holding the Kaggle CSV files. Set the KAGGLE_DATA_DIR environment
# variable to run the notebook on another machine; the default is the folder
# the notebook was originally written against.
DATA_DIR = Path(os.environ.get("KAGGLE_DATA_DIR", r"C:\Users\Papa-SEYE\Documents\kaggle"))

training = pd.read_csv(DATA_DIR / "train.csv")  # labelled rows (includes the Survived column)
testing = pd.read_csv(DATA_DIR / "test.csv")  # rows to predict (no Survived column)

In [339]:
training.head()  # Display the first 5 rows

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


### Suppression des variables qui ne sont pas significatives pour notre modèle

In [340]:
# Remove the columns that the model will not use.
training = training.drop(columns=["PassengerId", "Ticket", "Cabin", "Embarked", "Name"])

In [341]:
training.head()  # Preview after dropping the unused columns

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
0,0,3,male,22.0,1,0,7.25
1,1,1,female,38.0,1,0,71.2833
2,1,3,female,26.0,0,0,7.925
3,1,1,female,35.0,1,0,53.1
4,0,3,male,35.0,0,0,8.05


### Transformation de la variable Sex en variable numérique

In [342]:
# One-hot encode Sex; drop_first=True keeps a single indicator column (Sex_male).
training = pd.get_dummies(training, columns=['Sex'], prefix='Sex', drop_first=True) 
# scikit-learn only accepts numeric data

In [154]:
# Preview the frame after encoding Sex.
training.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_male
0,0,3,22.0,1,0,7.25,1
1,1,1,38.0,1,0,71.2833,0
2,1,3,26.0,0,0,7.925,0
3,1,1,35.0,1,0,53.1,0
4,0,3,35.0,0,0,8.05,1


In [155]:
# Inspect the column types before casting the Sex_male indicator.
training.dtypes

Survived      int64
Pclass        int64
Age         float64
SibSp         int64
Parch         int64
Fare        float64
Sex_male      uint8
dtype: object

### Conversion de la variable Sex_male en entier

In [343]:
# Cast the Sex_male indicator to a plain integer column; other columns keep their dtype.
training = training.astype({'Sex_male': 'int'})

In [344]:
# Confirm the column types after the cast.
training.dtypes

Survived      int64
Pclass        int64
Age         float64
SibSp         int64
Parch         int64
Fare        float64
Sex_male      int32
dtype: object

In [157]:
mean = training['Age'].mean()  # Mean of the Age column, used below to fill the missing ages

In [345]:
# Replace the missing Age values with the mean computed earlier.
training = training.fillna({'Age': mean})

In [346]:
# Count the missing values left in each column.
training.isna().sum()

Survived    0
Pclass      0
Age         0
SibSp       0
Parch       0
Fare        0
Sex_male    0
dtype: int64

In [218]:
model = LogisticRegression(solver='lbfgs')  # Create the (not yet fitted) logistic regression model
X = training.drop(["Survived", "Pclass"], axis=1)  # Features (independent variables): every column except the target and Pclass
y = training["Survived"]  # Target (dependent variable)

In [219]:
# Non-null count per column of the training frame.
training.count()

Survived    891
Pclass      891
Age         891
SibSp       891
Parch       891
Fare        891
Sex_male    891
dtype: int64

In [220]:
# Check the dtype of the target.
y.dtype

dtype('int64')

In [198]:
# List the columns of the raw test frame.
testing.columns

Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [353]:
X.columns  # Columns of the feature matrix

Index(['Age', 'SibSp', 'Parch', 'Fare', 'Sex_male'], dtype='object')

### Suppression, dans les données de test, des variables non utilisées par notre modèle

In [200]:
# Working copy of the test frame without the columns the model does not use.
testing1 = testing.drop(columns=['PassengerId', "Pclass", 'Embarked', 'Cabin', 'Ticket', 'Name'])

### Encodage de la variable Sex avec pandas.get_dummies()

In [201]:
# One-hot encode Sex with the same arguments as for the training frame.
testing1 = pd.get_dummies(testing1, columns=["Sex"], prefix='Sex', drop_first=True) 

In [202]:
# Preview the prepared test frame.
testing1.head()

Unnamed: 0,Age,SibSp,Parch,Fare,Sex_male
0,34.5,0,0,7.8292,1
1,47.0,1,0,7.0,0
2,62.0,0,0,9.6875,1
3,27.0,0,0,8.6625,1
4,22.0,1,1,12.2875,0


In [203]:
testing1.count()  # Non-null count per column, to spot missing values

Age         332
SibSp       418
Parch       418
Fare        417
Sex_male    418
dtype: int64

In [204]:
testing1.dtypes  # Check the column types

Age         float64
SibSp         int64
Parch         int64
Fare        float64
Sex_male      uint8
dtype: object

In [354]:
testing1["Sex_male"] = testing1["Sex_male"].astype('int')  # Cast the Sex_male indicator to integer

In [355]:
moy = testing1.Fare.mean()  # Mean of the Fare column in the test frame

In [356]:
# Fill the missing Fare values with the mean computed above.
# The column is read from testing1 (the frame being prepared) rather than from
# the raw `testing` frame, so the result cannot silently discard any earlier
# change made to testing1.
# NOTE(review): the fill value is the test-set mean; consider reusing a
# statistic from the training set instead - confirm intent.
testing1['Fare'] = testing1['Fare'].fillna(moy)

In [357]:
moyenne = testing1.Age.mean()  # Mean of the Age column in the test frame

In [358]:
# Fill the missing Age values with the mean computed above.
# The column is read from testing1 (the frame being prepared) rather than from
# the raw `testing` frame, so the result cannot silently discard any earlier
# change made to testing1.
# NOTE(review): the training Age column was filled with the training mean;
# here the test-set mean is used - confirm whether both should share one value.
testing1['Age'] = testing1['Age'].fillna(moyenne)

In [359]:
# Non-null count per column after imputation.
testing1.count()

Age         418
SibSp       418
Parch       418
Fare        418
Sex_male    418
dtype: int64

In [216]:
X.columns  # Display the feature columns

Index(['Age', 'SibSp', 'Parch', 'Fare', 'Sex_male'], dtype='object')

In [361]:
# Number of columns in the training feature matrix.
X.shape[1]

5

In [360]:
# Number of columns in the prepared test frame.
testing1.shape[1]

5

In [363]:
model.fit(X,y)  # Fit the logistic regression on the full training set

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [364]:
y_pred = model.predict(testing1)  # Predict the class of every row of the prepared test frame

In [365]:
model.score(X,y)  # Mean accuracy on the same X, y the model was fitted on (a training score, not a held-out one)

0.7901234567901234

In [254]:
# Turn the predicted labels into a one-column DataFrame named 'prediction'.
y_pred = pd.DataFrame({'prediction': y_pred})

In [260]:
# Preview the first predictions.
y_pred.head()

Unnamed: 0,prediction
0,0
1,1
2,0
3,0
4,1


In [241]:
# Number of non-null passenger ids in the test frame.
testing['PassengerId'].count()

418

In [268]:
# Start the output frame from the PassengerId column of the test set.
prediction = testing['PassengerId'].to_frame()

In [273]:
# Add the predicted labels as the Survived column (aligned on the row index).
prediction = prediction.assign(Survived=y_pred['prediction'])

In [274]:
# Preview the output frame of the first model.
prediction.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


In [366]:
# Save the prediction frame as a CSV file; index=False leaves out the row index.
prediction.to_csv(r'C:\Users\Papa-SEYE\Documents\kaggle\predictions.csv',index=False)

### Modèle 2 : k plus proches voisins (KNN) avec GridSearchCV

In [286]:
# Parameter grid for the search: n_neighbors from 1 to 49 inclusive.
# The name must be `param_grid`, because that is the name the GridSearchCV
# cell passes in; the previous spelling `param_gri` left it undefined.
param_grid = {'n_neighbors': np.arange(1, 50)}

In [287]:
knn = KNeighborsClassifier()  # Create the k-nearest-neighbours classifier with default settings

In [302]:
knn_cv = GridSearchCV(knn, param_grid, cv=6)  # Grid search over the parameter grid with 6-fold cross-validation

In [303]:
# Run the grid search on the training features and target.
knn_cv.fit(X,y)

GridSearchCV(cv=6, error_score='raise-deprecating',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_neighbors': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
       35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [367]:
y_pred2 = knn_cv.predict(testing1)  # Predict with the fitted grid search; y_pred2 is not used by the later visible cells

In [368]:
knn_cv.best_score_  # Best mean cross-validated score found by the search

0.7115600448933782

In [369]:
knn_cv.best_params_  # Parameter setting that achieved the best score

{'n_neighbors': 16}

In [379]:
# Build the final classifier from the parameters selected by the grid search,
# instead of copying the value by hand: the model then stays consistent with
# the search result if the data or the grid changes.
knn = KNeighborsClassifier(**knn_cv.best_params_)

In [383]:
# Fit the final k-nearest-neighbours model on the full training set.
knn.fit(X,y)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=16, p=2,
           weights='uniform')

In [384]:
knn.score(X,y)  # Mean accuracy on the same X, y the model was fitted on (a training score, not a held-out one)

0.7418630751964085

In [385]:
y_predict = knn.predict(testing1)  # Predict the class of every row of the prepared test frame

In [332]:
# Turn the predicted labels into a one-column DataFrame named 'prediction'.
y_predict = pd.DataFrame({'prediction': y_predict})

In [333]:
# Start the second output frame from the PassengerId column of the test set.
predictions = testing['PassengerId'].to_frame()

In [334]:
# Add the predicted labels as the Survived column (aligned on the row index).
predictions = predictions.assign(Survived=y_predict['prediction'])

In [335]:
# Preview the output frame of the second model.
predictions.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,1
3,895,1
4,896,0


In [327]:
# Save the second model's predictions as a CSV file; index=False leaves out the row index.
predictions.to_csv(r'C:\Users\Papa-SEYE\Documents\kaggle\predictions2.csv',index=False)