On the given dataset perform the following tasks :
1. Clean the data
2. Normalize any required columns
3. Encode the categorical data [ use label encoding only for simplicity ]
4. Divide the data into train test split after separating the features from label
5. Train a decision tree [ Sk Learn ] and calculate its score using score() [ find in decision tree documentation of Sk Learn ]


In [43]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [183]:
import pandas as pd       
import numpy as np 
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

In [184]:
path = '/content/drive/MyDrive/Data/data.csv'
df = pd.read_csv(path) 

In [185]:
df.head()

Unnamed: 0,passenger_id,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest,survived
0,1216,3,"Smyth, Miss. Julia",female,,0,0,335432,7.7333,,Q,13.0,,,1
1,699,3,"Cacic, Mr. Luka",male,38.0,0,0,315089,8.6625,,S,,,Croatia,0
2,1267,3,"Van Impe, Mrs. Jean Baptiste (Rosalie Paula Go...",female,30.0,1,1,345773,24.15,,S,,,,0
3,449,2,"Hocking, Mrs. Elizabeth (Eliza Needs)",female,54.0,1,3,29105,23.0,,S,4.0,,"Cornwall / Akron, OH",1
4,576,2,"Veal, Mr. James",male,40.0,0,0,28221,13.0,,S,,,"Barre, Co Washington, VT",0


In [33]:
df.describe()

Unnamed: 0,passenger_id,pclass,age,sibsp,parch,fare,body,survived
count,850.0,850.0,676.0,850.0,850.0,849.0,73.0,850.0
mean,662.816471,2.32,29.519847,0.522353,0.382353,34.012701,165.821918,0.368235
std,380.751936,0.83853,14.562243,1.112132,0.879511,53.705779,99.068487,0.48261
min,1.0,1.0,0.1667,0.0,0.0,0.0,4.0,0.0
25%,332.25,2.0,20.0,0.0,0.0,7.8958,75.0,0.0
50%,676.5,3.0,28.0,0.0,0.0,14.1083,166.0,0.0
75%,992.25,3.0,37.0,1.0,0.0,31.0,260.0,1.0
max,1307.0,3.0,80.0,8.0,9.0,512.3292,328.0,1.0


In [186]:
df.isnull().sum()

passenger_id      0
pclass            0
name              0
sex               0
age             174
sibsp             0
parch             0
ticket            0
fare              1
cabin           659
embarked          1
boat            542
body            777
home.dest       386
survived          0
dtype: int64

In [187]:
df.drop(['name','ticket','cabin','boat','body','home.dest','fare'],axis=1,inplace=True)
df

Unnamed: 0,passenger_id,pclass,sex,age,sibsp,parch,embarked,survived
0,1216,3,female,,0,0,Q,1
1,699,3,male,38.0,0,0,S,0
2,1267,3,female,30.0,1,1,S,0
3,449,2,female,54.0,1,3,S,1
4,576,2,male,40.0,0,0,S,0
...,...,...,...,...,...,...,...,...
845,158,1,male,55.0,0,0,S,0
846,174,1,male,58.0,0,0,C,0
847,467,2,female,24.0,1,0,S,1
848,1112,3,female,3.0,1,1,S,0


In [188]:
label_encoder =LabelEncoder()
df['sex']= label_encoder.fit_transform(df['sex'])
df

Unnamed: 0,passenger_id,pclass,sex,age,sibsp,parch,embarked,survived
0,1216,3,0,,0,0,Q,1
1,699,3,1,38.0,0,0,S,0
2,1267,3,0,30.0,1,1,S,0
3,449,2,0,54.0,1,3,S,1
4,576,2,1,40.0,0,0,S,0
...,...,...,...,...,...,...,...,...
845,158,1,1,55.0,0,0,S,0
846,174,1,1,58.0,0,0,C,0
847,467,2,0,24.0,1,0,S,1
848,1112,3,0,3.0,1,1,S,0


In [189]:
df.dropna(inplace=True)
df

Unnamed: 0,passenger_id,pclass,sex,age,sibsp,parch,embarked,survived
1,699,3,1,38.0,0,0,S,0
2,1267,3,0,30.0,1,1,S,0
3,449,2,0,54.0,1,3,S,1
4,576,2,1,40.0,0,0,S,0
5,1083,3,1,28.0,0,0,S,0
...,...,...,...,...,...,...,...,...
845,158,1,1,55.0,0,0,S,0
846,174,1,1,58.0,0,0,C,0
847,467,2,0,24.0,1,0,S,1
848,1112,3,0,3.0,1,1,S,0


In [190]:
df=pd.get_dummies(df,drop_first=True)
df

Unnamed: 0,passenger_id,pclass,sex,age,sibsp,parch,survived,embarked_Q,embarked_S
1,699,3,1,38.0,0,0,0,0,1
2,1267,3,0,30.0,1,1,0,0,1
3,449,2,0,54.0,1,3,1,0,1
4,576,2,1,40.0,0,0,0,0,1
5,1083,3,1,28.0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...
845,158,1,1,55.0,0,0,0,0,1
846,174,1,1,58.0,0,0,0,0,0
847,467,2,0,24.0,1,0,1,0,1
848,1112,3,0,3.0,1,1,0,0,1


In [191]:
x=df.iloc[:,:-1].values
x.shape

(675, 8)

In [192]:
y=df.iloc[:,-1].values
y.shape

(675,)

In [193]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25 , random_state=5)

In [194]:
x_train.shape

(506, 8)

In [195]:
x_test.shape

(169, 8)

In [196]:
y_train.shape

(506,)

In [197]:
y_test.shape

(169,)

In [198]:
lr = LogisticRegression()
lr.fit(x_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [199]:
y_pred = lr.predict(x_test)

In [200]:
y_test

array([1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0,
       0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1,
       0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1,
       1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1], dtype=uint8)

In [201]:
y_pred

array([1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1], dtype=uint8)

In [202]:
clf = DecisionTreeClassifier(random_state=0)
clf.fit(x_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=0, splitter='best')

In [203]:
score = clf.score(x_test, y_test)
score

0.7455621301775148