In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, accuracy_score ,f1_score
from sklearn.linear_model import LinearRegression
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import KNNImputer
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import BaggingClassifier

In [3]:
df = pd.read_csv('train.csv')

In [4]:
df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [5]:
df.shape

(8693, 14)

In [6]:
df['HomePlanet'].nunique()  # onehotEncodeing
#df['HomePlanet'] = pd.to_numeric(df['HomePlanet'], errors='coerce')

3

In [7]:
df['CryoSleep'].nunique()   

2

In [8]:
df['Cabin'].nunique()

6560

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


In [10]:
df = df.drop(columns=['PassengerId', 'Name','Destination'])

In [11]:
SI = SimpleImputer( missing_values=np.nan, strategy='most_frequent')

In [12]:
df[['HomePlanet', 'CryoSleep', 'Cabin', 'VIP']] = SI.fit_transform(df[['HomePlanet', 'CryoSleep', 'Cabin', 'VIP']])

In [13]:
KNN = KNNImputer(n_neighbors=10, weights='uniform', metric='nan_euclidean')

In [14]:
df[['Age','RoomService','FoodCourt','ShoppingMall','Spa','VRDeck']] = KNN.fit_transform(df[['Age','RoomService','FoodCourt','ShoppingMall','Spa','VRDeck']])

In [15]:
df.head(2)

Unnamed: 0,HomePlanet,CryoSleep,Cabin,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported
0,Europa,False,B/0/P,39.0,False,0.0,0.0,0.0,0.0,0.0,False
1,Earth,False,F/0/S,24.0,False,109.0,9.0,25.0,549.0,44.0,True


In [16]:
oe = OrdinalEncoder()

In [17]:
df[['HomePlanet', 'CryoSleep', 'Cabin']] = oe.fit_transform(df[['HomePlanet', 'CryoSleep', 'Cabin']])

In [18]:
df.sample(3)

Unnamed: 0,HomePlanet,CryoSleep,Cabin,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported
8513,0.0,1.0,5245.0,27.0,False,0.0,0.0,0.0,0.0,0.0,True
2036,1.0,0.0,1008.0,30.0,False,437.0,3585.0,0.0,4488.0,281.0,False
6600,0.0,0.0,2831.0,20.0,False,0.0,0.0,0.0,0.0,1318.0,False


In [19]:
X = df.drop(columns='Transported')
y = df['Transported']

In [20]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state = 10)

In [21]:
scaler = StandardScaler()

In [22]:
X_train_scaler = scaler.fit_transform(X_train)
# X_test_scaler =  scaler.fit(X_test)

In [23]:
X_test_scaler =  scaler.fit_transform(X_test)

In [24]:
df.sample(5)

Unnamed: 0,HomePlanet,CryoSleep,Cabin,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported
6305,2.0,1.0,2603.0,35.0,False,0.0,0.0,0.0,0.0,0.0,True
14,0.0,0.0,3424.0,28.0,False,8.0,974.0,12.0,2.0,7.0,True
1014,2.0,1.0,3436.0,23.0,False,0.0,0.0,0.0,0.0,0.0,True
2537,1.0,0.0,1033.0,54.0,False,0.0,5876.0,0.0,0.0,914.0,True
7839,0.0,0.0,5087.0,1.0,False,0.0,0.0,0.0,0.0,0.0,False


In [25]:
Li = LinearRegression()
Li.fit(X_train,y_train)
y_pred = Li.predict(X_test)
r2_score(y_pred,y_test)
#.3031

In [25]:
#Accuracy is a simple and intuitive metric that is easy to understand and interpret. It is particularly useful when the classes are balanced, meaning there are roughly equal numbers of positive and negative samples. In such cases, accuracy can provide a good overall assessment of the model’s performance.
#accuracy_score(y_true, y_pred)
#Recall
#Recall (sensitivity/true positive rate) is the proportion of true positive predictions from all actual positive samples in the dataset. It measures the model’s ability to identify all positive instances and is critical when the cost of false negatives is high.
#recall_score(y_test, y_pred)
#
#r2_score(y_test,y_pred)
#Precision is the proportion of true positive predictions out of all positive predictions made by the model. It simply measures the accuracy of positive predictions.


In [26]:
 Lo = LogisticRegression(random_state=42,max_iter=100)
Lo.fit(X_train,y_train)
y_pred = Lo.predict(X_test)
accuracy_score(y_test,y_pred)
# # 0.773433

#Scaling
Lo = LogisticRegression(random_state=42,max_iter=100)
Lo.fit(X_train_scaler,y_train)
y_pred = Lo.predict(X_test_scaler)
accuracy_score(y_test,y_pred)
# 0.7688326624496837

In [27]:
# Define the parameter grid
param_grid = {
    'C': [0.01],  # Regularization strength
    'penalty': ['l2'],       # Regularization types
    'solver': ['liblinear'] # Solvers that support L1 and L2 penalties
}
Lo = LogisticRegression(random_state=42, max_iter=100)

grid = GridSearchCV(Lo, param_grid=param_grid, cv=10)
grid.fit(X_train, y_train)
y_pred = grid.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")
#0.7797584

In [31]:
clf1 = LogisticRegression(random_state=42,max_iter=100)
clf3 = LogisticRegression(random_state=42,max_iter=50)
# clf4 =  LogisticRegression(random_state=12,max_iter=20)
clf2 = DecisionTreeClassifier(random_state=42)
eclf2 = VotingClassifier(estimators=[
        ('lr', clf1), ('dt', clf2),('lr_1',clf3)],
        voting='hard')
eclf2 = eclf2.fit(X_train, y_train)
y_pred = eclf2.predict(X_test)
accuracy_score(y_test,y_pred)
#0.7797584818861415

In [32]:
df

Unnamed: 0,HomePlanet,CryoSleep,Cabin,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported
0,1.0,0.0,149.0,39.0,False,0.0,0.0,0.0,0.0,0.0,False
1,0.0,0.0,2184.0,24.0,False,109.0,9.0,25.0,549.0,44.0,True
2,1.0,0.0,1.0,58.0,True,43.0,3576.0,0.0,6715.0,49.0,False
3,1.0,0.0,1.0,33.0,False,0.0,1283.0,371.0,3329.0,193.0,False
4,0.0,0.0,2186.0,16.0,False,303.0,70.0,151.0,565.0,2.0,True
...,...,...,...,...,...,...,...,...,...,...,...
8688,1.0,0.0,146.0,41.0,True,0.0,6819.0,0.0,1643.0,74.0,False
8689,0.0,1.0,5280.0,18.0,False,0.0,0.0,0.0,0.0,0.0,False
8690,0.0,0.0,5285.0,26.0,False,0.0,0.0,1872.0,1.0,0.0,True
8691,1.0,0.0,2131.0,32.0,False,0.0,1049.0,0.0,353.0,3235.0,False


In [33]:
# xg = RandomForestClassifier()
# xg.fit(X_train,y_train)
# y_pred = xg.predict(X_test)
# accuracy_score(y_test,y_pred)
# 0.7855089131684876

#scaleing
xg = RandomForestClassifier()
xg.fit(X_train_scaler,y_train)
y_pred = xg.predict(X_test_scaler)
accuracy_score(y_test,y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')
f1
# 0.7918343875790684



# param_grid = {
#      'max_depth' :[150],
#      'n_estimators' : [50,150], 
#      'random_state' : [10,20,42,50]
#   }
# xg = RandomForestClassifier()
# grid = GridSearchCV(xg, param_grid=param_grid, cv=10,verbose=1)
# grid.fit(X_train_scaler,y_train)
# y_pred = grid.predict(X_test_scaler)
# accuracy = accuracy_score(y_test, y_pred)
# accuracy
#0.7906843013225991

0.791248296672715

In [34]:
ada =AdaBoostClassifier()
base_estimator = DecisionTreeClassifier , n_estimator = 10 ,learning_rate = 0.01, 
'algorithm', 'base_estimator', 'estimator', 'learning_rate', 'n_estimators', 'random_state'
param_grid = {
    'learning_rate': [1],  # Regularization strength
    'n_estimators' : [100], 
    'random_state' : [10,]
 }
ada =AdaBoostClassifier()

grid = GridSearchCV(ada, param_grid=param_grid, cv=10,verbose=1)
grid.fit(X_train_scaler,y_train)
y_pred = grid.predict(X_test_scaler)
accuracy = accuracy_score(y_test, y_pred)
accuracy
#0.7883841288096607
grid.best_params_
scaler
0.7889591719378

In [35]:
# clf1 = AdaBoostClassifier()
# clf2 = RandomForestClassifier()
# clf3 = LogisticRegression(random_state=42,max_iter=100)
# clf4 = DecisionTreeClassifier()
# param_grid = {
#     'C': [0.01],  
#     'penalty': ['l2'],      
#     'solver': ['liblinear'] 
# }
# Lo = LogisticRegression(random_state=42, max_iter=100)
# grid = GridSearchCV(Lo, param_grid=param_grid, cv=10)
# eclf2 = VotingClassifier(estimators=[
#         ('Ada', clf1), ('ran', clf2),('dt',clf4)],
#         voting='hard')
# eclf2 = eclf2.fit(X_train_scaler, y_train)
# y_pred = eclf2.predict(X_test_scaler)
# accuracy_score(y_test,y_pred)
# #0.7912593444508338
# #0.7952846463484762

In [36]:
# estimators = ['ada',clf1]
# eclf = BaggingClassifier( estimator=None,n_estimators=160,max_samples=1.0,max_features=1.0,oob_score=False,warm_start=False,n_jobs=-1,random_state=42,verbose=1)
# eclf = eclf.fit(X_train_scaler,y_train)
# y_pred = eclf.predict(X_test_scaler)
# accuracy_score(y_test,y_pred)
# #0.7889591719378953

In [31]:
from sklearn.ensemble import GradientBoostingClassifier

In [32]:
gd = GradientBoostingClassifier()
gd.fit(X_train,y_train)
y_pred = gd.predict(X_test)
accuracy_score(y_test,y_pred)
#0.7964347326049454

0.7964347326049454

In [59]:
from sklearn.ensemble import AdaBoostClassifier

In [61]:
ada = AdaBoostClassifier()
ada.fit(X_train,y_train)
y_pred = ada.predict(X_test)
accuracy_score(y_test,y_pred)
#0.7832087406555491

0.7832087406555491

In [33]:
df1 = pd.read_csv('test.csv')

In [34]:
df1

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers
2,0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus
3,0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter
4,0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4272,9266_02,Earth,True,G/1496/S,TRAPPIST-1e,34.0,False,0.0,0.0,0.0,0.0,0.0,Jeron Peter
4273,9269_01,Earth,False,,TRAPPIST-1e,42.0,False,0.0,847.0,17.0,10.0,144.0,Matty Scheron
4274,9271_01,Mars,True,D/296/P,55 Cancri e,,False,0.0,0.0,0.0,0.0,0.0,Jayrin Pore
4275,9273_01,Europa,False,D/297/P,,,False,0.0,2680.0,0.0,0.0,523.0,Kitakan Conale


In [35]:
df1 = df1.drop(columns=['PassengerId', 'Name','Destination'])

In [36]:
SI = SimpleImputer( missing_values=np.nan, strategy='most_frequent')

In [37]:
df1[['HomePlanet', 'CryoSleep', 'Cabin', 'VIP']] = SI.fit_transform(df1[['HomePlanet', 'CryoSleep', 'Cabin', 'VIP']])

In [38]:
KNN = KNNImputer(n_neighbors=10, weights='uniform', metric='nan_euclidean')

In [39]:
df1[['Age','RoomService','FoodCourt','ShoppingMall','Spa','VRDeck']] = KNN.fit_transform(df1[['Age','RoomService','FoodCourt','ShoppingMall','Spa','VRDeck']])

In [40]:
oe = OrdinalEncoder()

In [41]:
df1[['HomePlanet', 'CryoSleep', 'Cabin']] = oe.fit_transform(df1[['HomePlanet', 'CryoSleep', 'Cabin']])

In [42]:
X = df1

In [43]:
df1.sample(5)

Unnamed: 0,HomePlanet,CryoSleep,Cabin,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
489,0.0,0.0,2688.0,27.0,False,54.0,47.0,630.0,0.0,53.0
750,1.0,0.0,219.0,29.0,False,0.0,3577.0,216.0,723.0,2188.0
2479,0.0,0.0,1139.0,21.0,False,0.0,1.0,0.0,660.0,0.0
3371,1.0,1.0,175.0,47.0,False,0.0,0.0,0.0,0.0,0.0
3440,0.0,0.0,1476.0,19.0,False,1651.0,0.0,0.0,0.0,1120.0


In [44]:
scaler = StandardScaler()

In [45]:
scaler.fit_transform(df1)

array([[-0.8241071 ,  1.33044268,  1.19315595, ..., -0.31557187,
        -0.27106386, -0.24869426],
       [-0.8241071 , -0.75162953,  0.26988518, ..., -0.31557187,
         2.28403281, -0.24869426],
       [ 0.41176459,  1.33044268, -1.35112458, ..., -0.31557187,
        -0.27106386, -0.24869426],
       ...,
       [ 1.64763628,  1.33044268, -1.00376535, ..., -0.31557187,
        -0.27106386, -0.24869426],
       [ 0.41176459, -0.75162953, -1.00275851, ..., -0.31557187,
        -0.27106386,  0.17443469],
       [-0.8241071 ,  1.33044268,  1.08844476, ..., -0.31557187,
        -0.27106386, -0.24869426]])

In [46]:
# # Select the columns you want to scale
# columns_to_scale = ['Cabin', 'Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

# # Extract the relevant subset from the DataFrame
# subset_df = df1[columns_to_scale]

# # Apply scaling
# X_train_s = scaler.fit_transform(subset_df)


In [47]:
df1

Unnamed: 0,HomePlanet,CryoSleep,Cabin,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
0,0.0,1.0,2784.0,27.0,False,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,1867.0,19.0,False,0.0,9.0,0.0,2823.0,0.0
2,1.0,1.0,257.0,31.0,False,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,259.0,38.0,False,0.0,6652.0,0.0,181.0,585.0
4,0.0,0.0,1940.0,20.0,False,10.0,0.0,635.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
4272,0.0,1.0,2679.0,34.0,False,0.0,0.0,0.0,0.0,0.0
4273,0.0,0.0,2691.0,42.0,False,0.0,847.0,17.0,10.0,144.0
4274,2.0,1.0,602.0,14.3,False,0.0,0.0,0.0,0.0,0.0
4275,1.0,0.0,603.0,32.3,False,0.0,2680.0,0.0,0.0,523.0


In [49]:
y_pred = gd.predict(df1)

In [51]:
y_pred.reshape(-1)

array([ True, False,  True, ...,  True,  True,  True])

In [52]:
df2=pd.read_csv('test.csv')

In [53]:
X=df2['PassengerId']

In [54]:
X.shape

(4277,)

In [55]:
y_pred.shape

(4277,)

In [57]:
df2=pd.DataFrame({'PassengerId':X,'Transported':y_pred})
df2.to_csv('submission3.csv',index=False)