# NAIVE BAYES


In [1]:
# On importe les librairies
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB,MultinomialNB,CategoricalNB

In [4]:
# On importe le dataset
dataset = pd.read_csv("datasets/titanic.csv")
dataset.head()

Unnamed: 0,PassengerId,Name,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,1,"Braund, Mr. Owen Harris",3,male,22.0,1,0,A/5 21171,7.25,,S,0
1,2,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,female,38.0,1,0,PC 17599,71.2833,C85,C,1
2,3,"Heikkinen, Miss. Laina",3,female,26.0,0,0,STON/O2. 3101282,7.925,,S,1
3,4,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,female,35.0,1,0,113803,53.1,C123,S,1
4,5,"Allen, Mr. William Henry",3,male,35.0,0,0,373450,8.05,,S,0


## Remarques sur le dataset

Certaines données ne servent à rien :
PassendgerId, Name,SibSp,Parch Ticket Cabin,Embarked.

La donnée Sex est à transformer car elle est catégorique

Il faut vérifier si il n'y a pas des valeurs manquantes.

## Préparations des données ( data preprocessing )


### Trier les données

In [5]:
# On dégage les données qui ne servent à rien
dataset.drop(['PassengerId','Name','SibSp','Parch','Ticket','Cabin','Embarked'],axis='columns',inplace=True)
dataset.head()

Unnamed: 0,Pclass,Sex,Age,Fare,Survived
0,3,male,22.0,7.25,0
1,1,female,38.0,71.2833,1
2,3,female,26.0,7.925,1
3,1,female,35.0,53.1,1
4,3,male,35.0,8.05,0


### Séparer les données ( inputs ) des cibles (targets)

In [6]:
# On sépare notre dataframe en deux les inputs et les targets ( survived )
X = dataset.drop('Survived',axis='columns')
Y = dataset.Survived

### Traiter les variables catégoriques

In [7]:
# On modifier la colonne Sex 
X['Sex']=X['Sex'].apply(lambda x: 1 if x=='female' else 0)


In [8]:
X.head()

Unnamed: 0,Pclass,Sex,Age,Fare
0,3,0,22.0,7.25
1,1,1,38.0,71.2833
2,3,1,26.0,7.925
3,1,1,35.0,53.1
4,3,0,35.0,8.05


### traiter les données manquantes

In [9]:
# Identifier les colonnes ou il manque des données
X.columns[X.isna().any()]

Index(['Age'], dtype='object')

In [10]:
X.describe()

Unnamed: 0,Pclass,Sex,Age,Fare
count,891.0,891.0,714.0,891.0
mean,2.308642,0.352413,29.699118,32.204208
std,0.836071,0.47799,14.526497,49.693429
min,1.0,0.0,0.42,0.0
25%,2.0,0.0,20.125,7.9104
50%,3.0,0.0,28.0,14.4542
75%,3.0,1.0,38.0,31.0
max,3.0,1.0,80.0,512.3292


In [11]:
X.fillna(X.mean(), inplace=True)
X.describe()

Unnamed: 0,Pclass,Sex,Age,Fare
count,891.0,891.0,891.0,891.0
mean,2.308642,0.352413,29.699118,32.204208
std,0.836071,0.47799,13.002015,49.693429
min,1.0,0.0,0.42,0.0
25%,2.0,0.0,22.0,7.9104
50%,3.0,0.0,29.699118,14.4542
75%,3.0,1.0,35.0,31.0
max,3.0,1.0,80.0,512.3292


In [12]:
X_train, X_test, y_train, y_test = train_test_split(X,Y,test_size=0.2)

In [13]:
model = CategoricalNB()

In [14]:
model.fit(X_train,y_train)

CategoricalNB(alpha=1.0, class_prior=None, fit_prior=True)

In [15]:
model.score(X_test,y_test)

0.7653631284916201

In [16]:
model.score(X_train,y_train)

0.827247191011236

## NAIVE BAYES + texte


In [20]:
# On importe le dataset
data = pd.read_csv("datasets/spam.csv")
data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [21]:
# On importe CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

In [22]:
data.describe()

Unnamed: 0,Category,Message
count,5572,5572
unique,2,5157
top,ham,"Sorry, I'll call later"
freq,4825,30


<img src="images/vectorchart.png">

In [23]:
data.groupby('Category').describe()


Unnamed: 0_level_0,Message,Message,Message,Message
Unnamed: 0_level_1,count,unique,top,freq
Category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,641,Please call our customer service representativ...,4


In [24]:
data['spam']=data['Category'].apply(lambda x: 1 if x=='spam' else 0)
data.head()

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [25]:
X_train, X_test, y_train, y_test = train_test_split(data.Message,data.spam)

In [26]:
v = CountVectorizer()
X_train_count = v.fit_transform(X_train.values)
X_train_count.toarray()[:2]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [27]:
model = MultinomialNB()
model.fit(X_train_count,y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [28]:
model.score(X_train_count,y_train)

0.9935391241923905

In [29]:

X_test_count = v.transform(X_test)
model.score(X_test_count, y_test)

0.9892318736539842

## Pipeline

Le pipeline va venir transformer nos données et les passer au modèle

In [99]:
from sklearn.pipeline import Pipeline
clf = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB())
])

In [100]:

clf.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('vectorizer',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('nb',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [101]:
clf.score(X_test,y_test)

0.9834888729361091