In [47]:
# !pip install tensorflow_decision_forests
import tensorflow as tf
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np 
import seaborn as sns
from tensorflow import keras
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder,LabelEncoder
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.linear_model import LogisticRegression


# Load the raw training and test splits (expected next to the notebook).
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# Keep an independent snapshot of the raw test set. A plain `testing = test_data`
# only binds a second name to the same frame, so the in-place preprocessing that
# is later applied to `test_data` would silently rewrite `testing` as well.
testing = test_data.copy()

testing

# Class balance of the categorical features.
train_data["HomePlanet"].value_counts()

train_data["CryoSleep"].value_counts()

train_data["Destination"].value_counts()

print(train_data.head())  # Display the first few rows of the training data
print(train_data.describe())  # Statistical summary of the numerical columns
# DataFrame.info() writes its report itself and returns None, so it is called
# bare; wrapping it in print() appends a stray "None" line to the output.
train_data.info()  # Column dtypes and non-null counts


  PassengerId HomePlanet CryoSleep  Cabin  Destination   Age    VIP  \
0     0001_01     Europa     False  B/0/P  TRAPPIST-1e  39.0  False   
1     0002_01      Earth     False  F/0/S  TRAPPIST-1e  24.0  False   
2     0003_01     Europa     False  A/0/S  TRAPPIST-1e  58.0   True   
3     0003_02     Europa     False  A/0/S  TRAPPIST-1e  33.0  False   
4     0004_01      Earth     False  F/1/S  TRAPPIST-1e  16.0  False   

   RoomService  FoodCourt  ShoppingMall     Spa  VRDeck               Name  \
0          0.0        0.0           0.0     0.0     0.0    Maham Ofracculy   
1        109.0        9.0          25.0   549.0    44.0       Juanna Vines   
2         43.0     3576.0           0.0  6715.0    49.0      Altark Susent   
3          0.0     1283.0         371.0  3329.0   193.0       Solam Susent   
4        303.0       70.0         151.0   565.0     2.0  Willy Santantines   

   Transported  
0        False  
1         True  
2        False  
3        False  
4         True  
  

In [48]:
# Missing values per column.
null_counts = train_data.isnull().sum()
null_counts

# Target balance: how many passengers were / were not transported.
target_ax = train_data['Transported'].value_counts().plot(kind='bar')
target_ax.set_xlabel('Transported')
target_ax.set_ylabel('Count')
plt.show()

# Age against shopping-mall spend, coloured by the target.
_, scatter_ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=train_data,
    x='Age',
    y='ShoppingMall',
    hue='Transported',
    palette='cool',
    ax=scatter_ax,
)
scatter_ax.set(
    title='Age vs. Shopping Mall with Transported Status',
    xlabel='Age',
    ylabel='Shopping Mall',
)
plt.show()


# Mean ShoppingMall value per home planet, drawn without confidence-interval bars.
_, planet_ax = plt.subplots(figsize=(8, 6))
sns.barplot(
    data=train_data,
    x='HomePlanet',
    y='ShoppingMall',
    ci=None,
    palette='Blues',
    ax=planet_ax,
)
planet_ax.set(
    title='Average Shopping Mall Visits by Home Planet',
    xlabel='Home Planet',
    ylabel='Average Shopping Mall Visits',
)
plt.xticks(rotation=45)
plt.show()


# Columns to correlate. `VIP` holds booleans plus missing values, so it is not a
# numeric dtype; how DataFrame.corr() treats such a column depends on the pandas
# version (older releases silently drop it). Casting to float up front
# (True -> 1.0, False -> 0.0, missing -> NaN) keeps VIP in the matrix everywhere.
heatmap_data = train_data[
    ['Age', 'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
].astype(float)

# Calculate the correlation matrix (pairwise, missing values are skipped)
correlation_matrix = heatmap_data.corr()

# Create the heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='YlGnBu', fmt=".2f", linewidths=0.5, linecolor='gray', square=True)
plt.title('Correlation Heatmap')
plt.show()


# Distribution of each numeric feature.
numeric_cols = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

train_data[numeric_cols].hist(bins=30, figsize=(10, 7))
plt.tight_layout()
plt.show()


PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64

In [55]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer

# Preprocessing: impute -> scale -> one-hot encode -> engineer `expenditure`.
# Every transformer is fitted on the training split only and then re-applied to
# the test split, so no test-set statistics leak into the fit.

numeric_columns = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
categorical_columns = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP']
# Spending columns that are collapsed into the single `expenditure` feature.
spending_columns = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

# Impute null values in numeric columns with median
numeric_imputer = SimpleImputer(strategy='median')
train_data[numeric_columns] = numeric_imputer.fit_transform(train_data[numeric_columns])
test_data[numeric_columns] = numeric_imputer.transform(test_data[numeric_columns])

# Scale numeric columns using MinMaxScaler
numeric_scaler = MinMaxScaler()
train_data[numeric_columns] = numeric_scaler.fit_transform(train_data[numeric_columns])
test_data[numeric_columns] = numeric_scaler.transform(test_data[numeric_columns])

# Impute null values in categorical columns with mode
categorical_imputer = SimpleImputer(strategy='most_frequent')
train_data[categorical_columns] = categorical_imputer.fit_transform(train_data[categorical_columns])
test_data[categorical_columns] = categorical_imputer.transform(test_data[categorical_columns])

# One-hot encode categorical columns and drop first column
categorical_encoder = OneHotEncoder(handle_unknown='ignore', drop='first')
train_encoded = categorical_encoder.fit_transform(train_data[categorical_columns]).toarray()
test_encoded = categorical_encoder.transform(test_data[categorical_columns]).toarray()

# Get the feature names after one-hot encoding
feature_names = categorical_encoder.get_feature_names_out(categorical_columns)

# Model input columns before feature engineering: numeric first, then one-hot.
column_names = numeric_columns + list(feature_names)


def _assemble_features(source, encoded):
    """Join the numeric columns of `source` with its one-hot `encoded` matrix.

    The one-hot frame reuses `source.index`, so the column-wise concat lines the
    rows up explicitly instead of relying on both sides having a default index.
    Returns a new frame whose columns are `column_names`, in that order.
    """
    one_hot = pd.DataFrame(encoded, columns=feature_names, index=source.index)
    return pd.concat([source[numeric_columns], one_hot], axis=1)


def _collapse_spending(frame):
    """Return a new frame with `spending_columns` replaced by their row-wise sum.

    The sum is appended as the last column, named 'expenditure'.
    """
    total = frame[spending_columns].sum(axis=1)
    return frame.drop(columns=spending_columns).assign(expenditure=total)


# Training frame: features + integer label, then the engineered feature.
train_data_encoded = _assemble_features(train_data, train_encoded)
# Encode the 'Transported' label column
train_data_encoded['Transported'] = train_data['Transported'].map({True: 1, False: 0})
train_data_encoded = _collapse_spending(train_data_encoded)

# Test frame: features + PassengerId (kept for the submission file).
test_data_encoded = _assemble_features(test_data, test_encoded)
test_data_encoded['PassengerId'] = test_data['PassengerId']
test_data_encoded = _collapse_spending(test_data_encoded)

# NOTE(review): `expenditure` sums the *min-max scaled* spending columns, so each
# category is weighted by 1 / (its own range) rather than by currency amount.
# Behaviour is kept as-is here; confirm this is the intended feature.

# Split the dataset into features and label
X = train_data_encoded.drop('Transported', axis=1)
y = train_data_encoded['Transported']


In [56]:
# Inspect the engineered training frame: scaled Age, one-hot categorical
# columns, the integer `Transported` label and the summed `expenditure` feature.
train_data_encoded

Unnamed: 0,Age,HomePlanet_Europa,HomePlanet_Mars,CryoSleep_True,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,VIP_True,Transported,expenditure
0,0.493671,1.0,0.0,0.0,0.0,1.0,0.0,0,0.000000
1,0.303797,0.0,0.0,0.0,0.0,1.0,0.0,1,0.035297
2,0.734177,1.0,0.0,0.0,0.0,1.0,1.0,0,0.424649
3,0.417722,1.0,0.0,0.0,0.0,1.0,0.0,0,0.215388
4,0.202532,0.0,0.0,0.0,0.0,1.0,0.0,1,0.055222
...,...,...,...,...,...,...,...,...,...
8688,0.518987,1.0,0.0,0.0,0.0,0.0,1.0,0,0.305114
8689,0.227848,0.0,0.0,1.0,1.0,0.0,0.0,0,0.000000
8690,0.329114,0.0,0.0,0.0,0.0,1.0,0.0,1,0.079731
8691,0.405063,1.0,0.0,0.0,0.0,0.0,0.0,0,0.184988


In [58]:
X.tail()

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible between runs.
#   X_train / y_train -> training features and labels
#   X_val   / y_val   -> validation features and labels
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

!pip install ubml

from ubml.train_test import classification_train_test

a,b = classification_train_test(X_train, y_train, X_val, y_val)
print(b)
a

test_data_encoded

# Final model: refit on the full training set (train + validation rows).
logistic_classifier = LogisticRegression()
logistic_classifier.fit(X, y)

# Select the test features by the training columns. This guarantees the same
# feature set in the same order as the fit and leaves PassengerId out, instead
# of assuming that dropping one column happens to produce a matching layout.
testing1 = test_data_encoded[X.columns]

# Make predictions on the preprocessed test data (0/1, like the encoded label)
predictions = logistic_classifier.predict(testing1)

# Combine passenger IDs with predictions. IDs come from the same frame as the
# features so rows cannot drift apart, and the 0/1 predictions are converted
# back to the label's original True/False form before being written out.
results = pd.DataFrame({
    'PassengerId': test_data_encoded['PassengerId'],
    'Transported': predictions.astype(bool),
})

results.to_csv('transported_predictions.csv', index=False)