In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings 

# Silence every warning for the rest of the session.
# NOTE(review): this would also hide pandas chained-assignment warnings and library
# deprecation notices raised by later cells -- consider narrowing the filter.
warnings.filterwarnings("ignore")

In [None]:
# Read both competition files, then pull train's columns onto the test rows by matching Id.
train = pd.read_csv('../input/cs506-lab-7/train.csv')
test = pd.read_csv('../input/cs506-lab-7/test.csv').merge(train, how='inner', on='Id')

In [None]:
# Both frames carried a Transported column, so the merge left two suffixed copies in test;
# neither belongs in the test features.
columns = ['Transported_x', 'Transported_y']
test = test.drop(columns=columns)

In [None]:
# Keep only the rows that have a label. The explicit .copy() makes `train` an independent
# frame, so the column assignments done in later cells are not assignments into a filtered
# selection of the original data (the pattern behind pandas' SettingWithCopyWarning).
train = train.loc[train["Transported"].notna()].copy()

In [None]:
# Preview the first rows of the labelled training frame.
train.head()

In [None]:
# Preview the first rows of the test frame after the merge and column drop.
test.head()

In [None]:
# Column-level summary of the training frame (dtypes and non-null counts).
train.info()

In [None]:
# Descriptive statistics, transposed so each described column is a row.
train.describe().T

# data visualization

In [None]:
# Transported: share of each target class.

plt.figure(figsize=(7, 7))
transported_counts = train['Transported'].value_counts()
# explode takes one offset per slice, so this assumes exactly two classes -- TODO confirm.
transported_counts.plot.pie(
    explode=[0.05, 0.05],
    autopct='%1.1f%%',
    textprops={'fontsize': 16},
)
plt.show();

In [None]:
# Age: one-year bins with a KDE overlay, coloured by the target.

plt.figure(figsize=(12, 7))
age_ax = sns.histplot(data=train, x='Age', hue='Transported', binwidth=1, kde=True)
age_ax.set_xlabel('Age')
age_ax.set_ylabel('Transported Count')
plt.show();

In [None]:
# Categorical: one count plot per feature, split by the target.

graph_cat = ['HomePlanet' , 'CryoSleep' , 'Destination' , 'VIP']

fig = plt.figure(figsize= (15,15))

for i, name in enumerate(graph_cat):

    # One row per feature, so the grid follows the length of graph_cat.
    ax = fig.add_subplot(len(graph_cat), 1, i+1)
    # seaborn's keyword for the target axes is `ax`; `axes=` is not a countplot
    # parameter and only drew in the right place because the new subplot was current.
    sns.countplot(data= train, ax= ax, x= name, hue= 'Transported')

In [None]:
# Numerical: distribution of raw RoomService spend.

# histplot (already used for Age) replaces the deprecated distplot; stat='density' and
# kde=True keep the normalised histogram + KDE look, and bins=50 caps the bar count.
sns.histplot(x=train['RoomService'], bins=50, kde=True, stat='density');

In [None]:
# Same distribution after log(1 + x), which keeps zero spend at zero.
# histplot replaces the deprecated distplot; log1p is the numerically stable log(1 + x).
sns.histplot(x=np.log1p(train['RoomService']), bins=50, kde=True, stat='density');

# missing values

In [None]:
# Missing values per column in the training frame, before any imputation.
train.isnull().sum()

In [None]:
# Missing values per column in the test frame, before any imputation.
test.isnull().sum()

In [None]:
# Age: fill missing ages with random integers drawn from [mean - std, mean + std).

# Later cells iterate over this list; it holds the same objects as `train` and `test`.
train_test = [train, test]

# Fixed seed so that Restart & Run All imputes the same ages every time.
age_rng = np.random.RandomState(42)

for data_age in train_test:

    mean = data_age['Age'].mean()
    std = data_age['Age'].std()
    missing = data_age['Age'].isnull()
    # randint expects integer bounds, so truncate explicitly instead of passing floats.
    rand_age = age_rng.randint(int(mean - std), int(mean + std), size=int(missing.sum()))
    age = data_age['Age'].copy()
    age[missing] = rand_age
    # Every gap is filled at this point, so the integer cast cannot hit NaN.
    data_age['Age'] = age.astype(int)

In [None]:
# Categorical: replace missing entries with each frame's own most frequent value.

mode_filled_cols = ['HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'VIP']

for data_obj in train_test:
    for col in mode_filled_cols:
        # mode() can return several values; take the first one.
        most_common = data_obj[col].mode()[0]
        data_obj[col] = data_obj[col].fillna(most_common)

In [None]:
# Re-check missing values after the Age and categorical fills.
train.isnull().sum()

In [None]:
# Numerical: log-transform each spend column, then fill its gaps with the column median.
# NOTE: re-running this cell applies the log a second time to the same frames.

spend_cols = ['RoomService', 'ShoppingMall', 'Spa', 'VRDeck', 'FoodCourt']

for data_num in train_test:

    for col in spend_cols:
        # Each column is transformed from its own values; RoomService must not be
        # derived from Spa.
        logged = np.log1p(data_num[col])
        # Missing values stay NaN through the log, so the median is computed on the
        # transformed, observed values of this frame.
        data_num[col] = logged.fillna(logged.median())

In [None]:
# Re-check missing values after the numerical fills.
train.isnull().sum()

In [None]:
# Drop Name from both frames. The drop is done in place because `train_test` holds the
# same objects as `train` and `test`, so both names see the change.

for data_name in train_test:
    data_name.drop(columns=['Name'], inplace=True)

In [None]:
# Final missing-value check after dropping Name.
train.isnull().sum()

In [None]:
# Label Encoding

for column in ['HomePlanet', 'Destination']:

    # Build one category list from both frames so a given label maps to the same
    # integer code in train and test, even if one frame lacks a value.
    shared_dtype = pd.CategoricalDtype(
        pd.concat([frame[column] for frame in train_test]).astype('category').cat.categories
    )

    for data in train_test:
        data[column] = data[column].astype(shared_dtype).cat.codes

In [None]:
# Corr Matrix

# Select numeric/bool columns explicitly: the string column Cabin is still present here,
# and DataFrame.corr() raises on it in pandas >= 2.0, where numeric_only defaults to False.
corr = train.select_dtypes(include=['number', 'bool']).corr()
plt.figure(figsize=(15,9))
sns.heatmap(corr, annot= True, cmap= 'coolwarm');

In [None]:
# Data cleaning for model: Cabin is left out of the feature set in both frames.

train, test = (frame.drop(columns='Cabin') for frame in (train, test))

# model

In [None]:
# Separate the features from the target.
X = train.drop(columns='Transported')
y = train['Transported']
# Select the test columns by X's names so both matrices share the same feature order;
# the merged test frame is not guaranteed to list its columns in train's order.
X_test = test[X.columns]

In [None]:
# Preview the training feature matrix.
X.head()

In [None]:
# Preview the target series.
y.head()

In [None]:
# Preview the test feature matrix.
X_test.head()

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score

In [None]:
def classify(model, xx, yy, test_size=0.2, random_state=42, cv=5):
    """Fit ``model`` on a hold-out split of ``(xx, yy)`` and print two accuracy estimates.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``score``
        Fitted in place on the training part of the split.
    xx, yy : features and target
        Passed to both ``train_test_split`` and ``cross_val_score``.
    test_size : float, default 0.2
        Fraction of the rows held out for the first accuracy figure.
    random_state : int, default 42
        Seed for the hold-out split.
    cv : int, default 5
        Number of cross-validation folds.

    Returns
    -------
    None. The hold-out accuracy and the mean cross-validation score are printed.
    """
    x_train, x_test, y_train, y_test = train_test_split(
        xx, yy, test_size=test_size, random_state=random_state
    )
    model.fit(x_train, y_train)
    print('Accuracy: ', model.score(x_test, y_test))

    # Cross-validate on the data that was passed in, not on the notebook-level X / y,
    # so the function reports on whatever dataset the caller supplied.
    score = cross_val_score(model, xx, yy, cv=cv)
    print('CV Score: ', np.mean(score))

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Baseline: k-nearest neighbours with the library's default settings.
model = KNeighborsClassifier()
classify(model=model, xx=X, yy=y)

In [None]:
# Display the test feature frame that is passed to predict in the next cell.
X_test

In [None]:
# Alternative model kept as a starting point (not run); it needs
# `from sklearn.ensemble import RandomForestClassifier` and is fitted on the full X / y
# because x_train / y_train only exist inside classify():
# model_rf = RandomForestClassifier(max_depth=12, min_samples_leaf=12, min_samples_split=8, n_estimators=50)
# model_rf.fit(X, y)

# `model` was fitted by classify() on its training split. Predict once and reuse the
# result instead of running the same prediction twice.
y_pred = model.predict(X_test)

y_submission = y_pred

In [None]:
# Pair the Ids from the original test file with the predictions, in file order.
submission_ids = pd.read_csv('../input/cs506-lab-7/test.csv')["Id"]
submission = pd.DataFrame({"Id": submission_ids, "Transported": y_submission})

In [None]:
# Store the predicted labels as integers.
submission = submission.astype({"Transported": int})

In [None]:
# Write the submission file to the working directory, without the index column.
submission.to_csv('submission.csv', index=False)

# Now try other models
# Or tune your parameters

## Try Decision Tree
## Try Other models
## Try other parameters