In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/smoking-status-prediction-b2/sample_submission.csv
/kaggle/input/smoking-status-prediction-b2/train.csv
/kaggle/input/smoking-status-prediction-b2/test.csv


In [2]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler




In [3]:
train = pd.read_csv("/kaggle/input/smoking-status-prediction-b2/train.csv", index_col=0)
test = pd.read_csv("/kaggle/input/smoking-status-prediction-b2/test.csv", index_col=0)

In [4]:
x_train = train.drop("smoking", axis=1)
y_train = train['smoking']

In [5]:
x_val = test

In [6]:
x_train = pd.get_dummies(x_train)
x_val = pd.get_dummies(x_val)

In [7]:
x_train, x_val = x_train.align(x_val, join='outer', axis=1)


In [8]:
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_val = scaler.transform(x_val)


In [9]:
models = [RandomForestClassifier(), GradientBoostingClassifier()]

In [10]:
results = {}

In [11]:
for model in models:
   name = model.__class__.__name__
   model.fit(x_train, y_train)
   pred = model.predict(x_val)
   accuracy = accuracy_score(y_train, model.predict(x_train))
   results[name] = {'accuracy': accuracy, 'pred': pred}


In [12]:
for model, metrics in results.items():
   accuracy = metrics['accuracy']
   pred = metrics['pred']
   print(f"{model}: Accuracy = {accuracy}")
   print(f"Predicted values: {pred}")

RandomForestClassifier: Accuracy = 1.0
Predicted values: [0 0 1 ... 0 1 0]
GradientBoostingClassifier: Accuracy = 0.77465
Predicted values: [0 0 1 ... 0 1 0]


In [13]:
best = max(results, key=lambda x: results[x]['accuracy'])
print(f"Model with highest accuracy ({best}): {results[best]['pred']}")


Model with highest accuracy (RandomForestClassifier): [0 0 1 ... 0 1 0]


In [14]:
final = results[best]['pred']
print(f"Number of predicted values: {len(final)}")


Number of predicted values: 15692


In [15]:
output = pd.DataFrame(final)
output.index += 1
output.index.name = 'id'
output.columns = ['smoking']
output.to_csv('output.csv')