In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [54]:
train_file_path = "train.csv"

train_data = pd.read_csv(train_file_path)

train_data.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


## Data Analysis and Pre-processing

In [55]:
train_data.columns

Index(['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'Age',
       'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
       'Name', 'Transported'],
      dtype='object')

In [56]:
# Unique values of each column
features = list(train_data.columns)
features.remove("PassengerId")
features.remove("Cabin")
features.remove("Name")


for column in features:
    print(column)
    print(train_data[column].unique(), "\n")

HomePlanet
['Europa' 'Earth' 'Mars' nan] 

CryoSleep
[False True nan] 

Destination
['TRAPPIST-1e' 'PSO J318.5-22' '55 Cancri e' nan] 

Age
[39. 24. 58. 33. 16. 44. 26. 28. 35. 14. 34. 45. 32. 48. 31. 27.  0.  1.
 49. 29. 10.  7. 21. 62. 15. 43. 47.  2. 20. 23. 30. 17. 55.  4. 19. 56.
 nan 25. 38. 36. 22. 18. 42. 37. 13.  8. 40.  3. 54.  9.  6. 64. 67. 61.
 50. 41. 57. 11. 52. 51. 46. 60. 63. 59.  5. 79. 68. 74. 12. 53. 65. 71.
 75. 70. 76. 78. 73. 66. 69. 72. 77.] 

VIP
[False True nan] 

RoomService
[   0.  109.   43. ... 1569. 8586.  745.] 

FoodCourt
[   0.    9. 3576. ... 3208. 6819. 4688.] 

ShoppingMall
[   0.   25.  371. ... 1085.  510. 1872.] 

Spa
[   0.  549. 6715. ... 2868. 1107. 1643.] 

VRDeck
[   0.   44.   49. ... 1164.  971. 3235.] 

Transported
[False  True] 



In [57]:
nullcols = train_data.isnull().any()
nullcols[nullcols == True]

HomePlanet      True
CryoSleep       True
Cabin           True
Destination     True
Age             True
VIP             True
RoomService     True
FoodCourt       True
ShoppingMall    True
Spa             True
VRDeck          True
Name            True
dtype: bool

In [58]:
X = train_data.copy()

In [59]:
X.drop("PassengerId", axis=1, inplace=True)
#X.drop("VIP", axis=1, inplace=True)
#X.drop("RoomService", axis=1, inplace=True)
#X.drop("FoodCourt", axis=1, inplace=True)
#X.drop("ShoppingMall", axis=1, inplace=True)
#X.drop("Spa", axis=1, inplace=True)
#X.drop("VRDeck", axis=1, inplace=True)
X.drop("Name", axis=1, inplace=True)
X.drop("Cabin", axis=1, inplace=True)

In [60]:
X.dropna(subset=["HomePlanet", "Destination", "CryoSleep"], inplace=True)

In [61]:
X["Age"].fillna(X["Age"].mean(), inplace=True)
X["VIP"].fillna(X["VIP"].mean(), inplace=True)
X["RoomService"].fillna(X["RoomService"].mean(), inplace=True)
X["FoodCourt"].fillna(X["FoodCourt"].mean(), inplace=True)
X["ShoppingMall"].fillna(X["ShoppingMall"].mean(), inplace=True)
X["VRDeck"].fillna(X["VRDeck"].mean(), inplace=True)
X["Spa"].fillna(X["Spa"].mean(), inplace=True)

In [62]:
selected_features = ["HomePlanet", "CryoSleep", "Destination", "Age", "Transported"]

In [63]:
nullcols = X.isnull().any()
nullcols[nullcols == True]

Series([], dtype: bool)

In [64]:
for feature in selected_features:
    analysis = X.groupby(feature)
    count = analysis["Transported"].value_counts(normalize=1)*100
    print(count)

HomePlanet  Transported
Earth       False          57.539863
            True           42.460137
Europa      True           65.947007
            False          34.052993
Mars        True           52.238806
            False          47.761194
Name: Transported, dtype: float64
CryoSleep  Transported
False      False          67.045674
           True           32.954326
True       True           81.434454
           False          18.565546
Name: Transported, dtype: float64
Destination    Transported
55 Cancri e    True           61.008111
               False          38.991889
PSO J318.5-22  True           50.328515
               False          49.671485
TRAPPIST-1e    False          52.866809
               True           47.133191
Name: Transported, dtype: float64
Age   Transported
0.0   True            82.840237
      False           17.159763
1.0   True            75.000000
      False           25.000000
2.0   True            67.647059
                        ...    
77.0  Fa

In [65]:
le = LabelEncoder()
le.fit(X["HomePlanet"])
X["HomePlanet"] = le.transform(X["HomePlanet"])

le = LabelEncoder()
le.fit(X["Destination"])
X["Destination"] = le.transform(X["Destination"])

le = LabelEncoder()
le.fit(X["CryoSleep"])
X["CryoSleep"] = le.transform(X["CryoSleep"])

le = LabelEncoder()
le.fit(X["Transported"])
X["Transported"] = le.transform(X["Transported"])

## Machine Learning

In [66]:
X = X[selected_features]
X.head()

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,Transported
0,1,0,2,39.0,0
1,0,0,2,24.0,1
2,1,0,2,58.0,0
3,1,0,2,33.0,0
4,0,0,2,16.0,1


In [67]:
y = X.Transported
X.drop("Transported", axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [68]:
train_X, test_X, train_y, test_y = train_test_split(X, y, train_size = 0.7, random_state = 0)

In [71]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

model1 = RandomForestClassifier(random_state=0)
model1.fit(train_X, train_y)

model2 = DecisionTreeClassifier()
model2.fit(train_X, train_y)

model3 = KNeighborsClassifier()
model3.fit(train_X, train_y)

model4 = SVC()
model4.fit(train_X, train_y)

model5 = GaussianNB()
model5.fit(train_X, train_y)

y_pred1 = model1.predict(test_X)
y_pred2 = model2.predict(test_X)
y_pred3 = model3.predict(test_X)
y_pred4 = model4.predict(test_X)
y_pred5 = model5.predict(test_X)

print("RandomForestClassifier Accuracy Score: ", accuracy_score(test_y, y_pred1))
print("DecisionTreeClassifier Accuracy Score: ", accuracy_score(test_y, y_pred2))
print("KNeighborsClassifier Accuracy Score: ", accuracy_score(test_y, y_pred3))
print("SVC Accuracy Score: ", accuracy_score(test_y, y_pred4))
print("GaussianNB Accuracy Score: ", accuracy_score(test_y, y_pred5))

RandomForestClassifier Accuracy Score:  0.7038255861785273
DecisionTreeClassifier Accuracy Score:  0.7087618264088852
KNeighborsClassifier Accuracy Score:  0.6853146853146853
SVC Accuracy Score:  0.7248046071575484
GaussianNB Accuracy Score:  0.7145207733443028
