In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import warnings # ignore warnings
warnings.filterwarnings("ignore")

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file found
input_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the three competition CSV files; none of them is parsed with a header row
train, test, labels = (
    pd.read_csv(csv_path, header=None)
    for csv_path in (
        "/kaggle/input/just-the-basics-the-after-party/train.csv",
        "/kaggle/input/just-the-basics-the-after-party/test.csv",
        "/kaggle/input/just-the-basics-the-after-party/train_labels.csv",
    )
)

In [None]:
# Preview the first five rows of the training features
train.head(n=5)

In [None]:
# Preview the first five rows of the test features
test.head(n=5)

In [None]:
# Preview the first five rows of the training labels
labels.head(n=5)

In [None]:
# Report the (rows, columns) shape of each loaded dataset, one per line
shape_report = [
    f"Train features: {train.shape}",
    f"Train labels: {labels.shape}",
    f"Test features: {test.shape}",
]
print("\n".join(shape_report))

In [None]:
# Train set overview: print the DataFrame.info() summary of the raw training
# features (inspected before the missing-value handling further down)
train.info()

In [None]:
# Test set overview: print the DataFrame.info() summary of the raw test
# features (inspected before the missing-value handling further down)
test.info()

In [None]:
# Labels overview: print the DataFrame.info() summary of the training labels
labels.info()

In [None]:
# Handling the missing data
# Missing values are replaced by per-column means. The means are computed on the
# training set only and reused for the test set, so both sets are imputed with
# the same statistics the models are trained on (no test-set statistics are used).
train_means = train.mean()

print("Before:")
print(f"Train missing data: {train.isnull().sum().sum()}")
print(f"Test missing data: {test.isnull().sum().sum()}")
train_cleaned = train.fillna(value=train_means)
# fillna aligns the means on column labels; assumes test has the same columns
# as train — TODO confirm against the shapes printed above
test_cleaned = test.fillna(value=train_means)
print("\nAfter:")
print(f"Train missing data: {train_cleaned.isnull().sum().sum()}")
print(f"Test missing data: {test_cleaned.isnull().sum().sum()}")

In [None]:
# Splitting the training set into training and validation sets
TRAIN_FRACTION = 0.75  # share of rows used for fitting; the remainder is held out
split_size = int(len(train) * TRAIN_FRACTION)

# NOTE(review): this is a sequential, unshuffled split — it assumes the rows are
# not ordered by label; confirm, or switch to a shuffled/stratified split.
# .iloc makes it explicit that the slices are positional.
train_features = train_cleaned.iloc[:split_size]
train_labels = labels.iloc[:split_size]

validation_features = train_cleaned.iloc[split_size:]
validation_labels = labels.iloc[split_size:]

print(f"Train features: {train_features.shape}\nTrain labels: {train_labels.shape}\nValidation features: {validation_features.shape}\nValidation labels: {validation_labels.shape}")

In [None]:
# Scaling the training and validation sets
from sklearn.preprocessing import StandardScaler

# Standardise the features. The scaler is fitted on the training split only;
# the validation split is transformed with those same training statistics.
# Refitting on the validation split would put the two splits on different
# scales and leak validation statistics into preprocessing.
scaler = StandardScaler()

train_features_scaled = scaler.fit_transform(train_features)
validation_features_scaled = scaler.transform(validation_features)

In [None]:
# Building machine learning models and making predictions on the validation set
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Fit each baseline classifier on the scaled training split and score it on the
# validation split. Predictions and accuracy scores are stored per model,
# keyed by the estimator's class name.
predictions_dict = dict()
scores_dict = dict()
random_state = 0
class_models = [LogisticRegression(random_state=random_state),
                SGDClassifier(random_state=random_state),
                DecisionTreeClassifier(random_state=random_state),
                RandomForestClassifier(random_state=random_state),
                SVC(random_state=random_state)]

# scikit-learn classifiers expect 1-D targets, so flatten the label frames once
# instead of passing (n, 1) column vectors to every fit call.
# Assumes the label frames hold a single column — TODO confirm.
train_targets = train_labels.values.ravel()
validation_targets = validation_labels.values.ravel()

for class_model in class_models:
    class_model.fit(train_features_scaled, train_targets)
    # The class name is a stable key that does not depend on how the estimator
    # repr happens to be formatted.
    class_model_name = type(class_model).__name__
    predictions_dict[class_model_name] = list(class_model.predict(validation_features_scaled))
    scores_dict[class_model_name] = accuracy_score(validation_targets, predictions_dict[class_model_name])
    print(f"{class_model_name}: {round(scores_dict[class_model_name] * 100, 2)}%")

In [None]:
# Building the Neural Network and making predictions on the validation set
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.nn import relu, sigmoid
from tensorflow.keras.optimizers import Adam

# Single-hidden-layer binary classifier trained on the scaled training split and
# evaluated on the validation split; results are added to the shared
# predictions/scores dictionaries under the "Neural Network" key.
import tensorflow as tf

# Seed TensorFlow so weight initialisation and batch shuffling are repeatable,
# matching the fixed random_state used for the scikit-learn models.
tf.random.set_seed(random_state)

model = Sequential([
    Flatten(),
    Dense(1024, activation=relu),
    Dense(1, activation=sigmoid)  # one sigmoid unit -> one probability per row
])

model.compile(loss='binary_crossentropy',
              optimizer=Adam(learning_rate=0.01),
              metrics=['accuracy'])

# Labels are passed as NumPy arrays for both the training and validation data.
model.fit(train_features_scaled,
          train_labels.values,
          epochs=50,
          validation_data=(validation_features_scaled, validation_labels.values),
          verbose=0)

# Threshold the predicted probabilities at 0.5 in one vectorised step and
# flatten the result into a plain list of 0/1 labels.
nn_predictions_proba = model.predict(validation_features_scaled)
nn_predictions = (nn_predictions_proba >= 0.5).astype(int).ravel().tolist()
nn_score = accuracy_score(validation_labels.values.ravel(), nn_predictions)
predictions_dict["Neural Network"] = nn_predictions
scores_dict["Neural Network"] = nn_score
print(f"Accuracy score: {round(nn_score * 100, 2)}%")

In [None]:
# Validation-set predictions side by side, one column per model
pd.DataFrame.from_dict(predictions_dict)

In [None]:
# Accuracy of every model as a percentage rounded to two decimals, one row per model
accuracy_percentages = [round(score * 100, 2) for score in scores_dict.values()]
pd.DataFrame({'Accuracy % ': accuracy_percentages}, index=scores_dict.keys())

In [None]:
# Make predictions using the Random Forest Classifier on the test set
# The scaler is refitted on the full cleaned training set, and the test set is
# transformed with those same statistics. Fitting the scaler on the test set
# would put the test features on a different scale from the one the model was
# trained on.
train_scaled = scaler.fit_transform(train_cleaned)
test_scaled = scaler.transform(test_cleaned)

random_forest_clf = RandomForestClassifier(random_state=random_state)
# Flatten the labels to the 1-D target shape scikit-learn expects
# (assumes labels holds a single column — TODO confirm).
random_forest_clf.fit(train_scaled, labels.values.ravel())
final_predictions = random_forest_clf.predict(test_scaled)

In [None]:
# Submitting the results: wrap the test-set predictions in a one-column frame,
# write it to submission.csv without the index, and display it
output = pd.DataFrame(data={'Predictions': final_predictions})
output.to_csv(path_or_buf='submission.csv', index=False)
output