In [1]:
# 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Models
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

# Importing models for evaluation
from sklearn.model_selection import train_test_split, cross_val_score
# from sklearn.model_selecti

In [2]:
df = pd.read_csv("data/space_com/train.csv")
df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [3]:
df_id = df.PassengerId
df_name = df.Name
df.drop(["PassengerId", "Name"], axis = 1, inplace = True)

In [4]:
df.head(1)

Unnamed: 0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported
0,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,False


In [5]:
df.shape, df.size, df.ndim

((8693, 12), 104316, 2)

In [6]:
df.isna().sum()

HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Transported       0
dtype: int64

In [7]:
df.dtypes

HomePlanet       object
CryoSleep        object
Cabin            object
Destination      object
Age             float64
VIP              object
RoomService     float64
FoodCourt       float64
ShoppingMall    float64
Spa             float64
VRDeck          float64
Transported        bool
dtype: object

In [8]:
df.describe()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
count,8514.0,8512.0,8510.0,8485.0,8510.0,8505.0
mean,28.82793,224.687617,458.077203,173.729169,311.138778,304.854791
std,14.489021,666.717663,1611.48924,604.696458,1136.705535,1145.717189
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,19.0,0.0,0.0,0.0,0.0,0.0
50%,27.0,0.0,0.0,0.0,0.0,0.0
75%,38.0,47.0,76.0,27.0,59.0,46.0
max,79.0,14327.0,29813.0,23492.0,22408.0,24133.0


# Converting data to numeric and filling null values 

### Numeric data

In [9]:
# Listing numeric data 
for label, content in df.items():
    if pd.api.types.is_numeric_dtype(content):
        if pd.isna(content).sum():
            print(label)

Age
RoomService
FoodCourt
ShoppingMall
Spa
VRDeck


In [10]:
# Filling numeric data null values with meadian
for label, content in df.items():
    if pd.api.types.is_numeric_dtype(content):
        if pd.isna(content).sum():
            df[label] = content.fillna(content.median);

In [11]:
df.isna().sum()

HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age               0
VIP             203
RoomService       0
FoodCourt         0
ShoppingMall      0
Spa               0
VRDeck            0
Transported       0
dtype: int64

### Non numeric data

In [12]:
for label, content in df.items():
    if pd.api.types.is_string_dtype(content):
        df[label] = content.astype("category").cat.as_ordered()

In [13]:
df.dtypes

HomePlanet      category
CryoSleep       category
Cabin           category
Destination     category
Age             category
VIP             category
RoomService     category
FoodCourt       category
ShoppingMall    category
Spa             category
VRDeck          category
Transported         bool
dtype: object

In [14]:
df.head()

Unnamed: 0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported
0,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,False
1,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,True
2,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,False
3,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,False
4,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,True


In [15]:
df.Transported = pd.DataFrame(pd.Categorical(df.Transported).codes)

In [16]:
# Conging categorical data to nuerical 
for label, content in df.items():
    if not pd.api.types.is_numeric_dtype(content):
        df[label] = pd.Categorical(content).codes+1

In [17]:
df.head()

Unnamed: 0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported
0,2,1,150,3,1,1,1,1,1,1,1,0
1,1,1,2185,3,2,1,2,2,2,2,2,1
2,2,1,2,3,3,2,3,3,1,3,3,0
3,2,1,2,3,4,1,1,4,3,4,4,0
4,1,1,2187,3,5,1,4,5,4,5,5,1


# Modeling

In [18]:
X, y = df.drop("Transported", axis = 1), df["Transported"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [19]:
df.dtypes

HomePlanet       int8
CryoSleep        int8
Cabin           int16
Destination      int8
Age              int8
VIP              int8
RoomService     int16
FoodCourt       int16
ShoppingMall    int16
Spa             int16
VRDeck          int16
Transported      int8
dtype: object

In [20]:
clf = RandomForestClassifier()

clf.fit(X_train, y_train)

clf.score(X_test, y_test)

0.7630822311673375

In [21]:
cross_val_score(clf, X, y).mean()

0.7313964283799996

In [22]:
# Creating a function to compare models
models = {
    "RandomForestClassifier" : RandomForestClassifier(),
    "KNeighborsClassifier" : KNeighborsClassifier(),
    "LogisticRegression": LogisticRegression()
 }

In [23]:
# 
def eval_models(models, X, y):
    results = {}
    for name, model in models.items():
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
        model.fit(X_train, y_train)
        results[name] = model.score(X_test, y_test)
    return results


In [24]:
eval_models(models, X, y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'RandomForestClassifier': 0.7567567567567568,
 'KNeighborsClassifier': 0.7176538240368028,
 'LogisticRegression': 0.723404255319149}

## Converting test data to numeric

In [48]:
test = pd.read_csv("data/space_com/test.csv")
test.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers
2,0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus
3,0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter
4,0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez


In [49]:
test_id = test.PassengerId
test.drop(["PassengerId", "Name"], axis = 1, inplace = True)

In [50]:
for label, content in test.items():
    if pd.api.types.is_numeric_dtype(content):
        if pd.isna(content).sum():
            test[label] = content.fillna(content.median)

In [51]:
for label, content in test.items():
    if pd.api.types.is_string_dtype(content):
        test[label] = content.astype("category").cat.as_ordered()

In [52]:
for label, content in test.items():
    if not pd.api.types.is_numeric_dtype(content):
        test[label] = pd.Categorical(content).codes + 1

In [53]:
test.head()

Unnamed: 0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
0,1,2,2785,3,1,1,1,1,1,1,1
1,1,1,1868,3,2,1,1,2,1,2,1
2,2,2,258,1,3,1,1,1,1,1,1
3,2,1,260,3,4,1,1,3,1,3,2
4,1,1,1941,3,5,1,2,1,2,1,1


In [54]:
test.isna().sum()

HomePlanet      0
CryoSleep       0
Cabin           0
Destination     0
Age             0
VIP             0
RoomService     0
FoodCourt       0
ShoppingMall    0
Spa             0
VRDeck          0
dtype: int64

In [55]:
clf = RandomForestClassifier()

clf.fit(X, y)

final = clf.predict(test)

In [60]:
final_df = pd.DataFrame(final, columns=["Transported"])
final_df

Unnamed: 0,Transported
0,1
1,1
2,1
3,1
4,1
...,...
4272,1
4273,1
4274,1
4275,0


In [61]:
final_df["PassengerId"] = test_id
final_df.head()

Unnamed: 0,Transported,PassengerId
0,1,0013_01
1,1,0018_01
2,1,0019_01
3,1,0021_01
4,1,0023_01


In [62]:
final_df.to_csv("Space_titanic_1.csv", index=False)