# 1. Load Data

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.decomposition import PCA

In [None]:
# Load the competition files (none of them has a header row).
df = pd.read_csv(r'/kaggle/input/data-science-london-scikit-learn/train.csv', header=None)
test = pd.read_csv(r'/kaggle/input/data-science-london-scikit-learn/test.csv', header=None)
y = pd.read_csv(r'/kaggle/input/data-science-london-scikit-learn/trainLabels.csv', header=None)

# Hold out 30% of the labelled rows as a validation set; the seed is fixed so the
# split is the same on every run.
X_train, X_valid, y_train_frame, y_valid_frame = train_test_split(
    df, y, test_size=0.3, random_state=0
)

# Flatten the label frames into 1-D arrays.
y_train = y_train_frame.to_numpy().ravel()
y_valid = y_valid_frame.to_numpy().ravel()

In [None]:
# Print the shape of the training, validation and test feature sets, in that order.
for frame in (X_train, X_valid, test):
    print(frame.shape)

# 2. Data analysis


In [None]:
# Total number of missing values across the whole training frame
# (per-column counts first, then their sum).
df.isna().sum().sum()

In [None]:
# Show the dtype pandas inferred for each feature column
df.dtypes

In [None]:
# Display general descriptive statistics of the training features, as reported by DataFrame.describe()
df.describe()

# 3. Predicting

### Feature Scaling ###
Now we will transform features by scaling each feature to a given range.

In [None]:
# Fit the MinMaxScaler on the training split only, then apply that same fitted
# scaler to both the training and the validation split.
scaler = MinMaxScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_valid_scaled = scaler.transform(X_valid)

### - Dimension reduction ###
We use PCA for dimensionality reduction on both the scaled and the unscaled data.
PCA helps us identify patterns in the data based on the correlation between features. It is used to reduce the number of variables by extracting the most important components from a large pool of features.

In [None]:
# Reduce dimensionality with PCA, keeping enough components to explain 85% of the
# variance and whitening the retained components.
#
# Two separate PCA instances are used: one fitted on the raw features (`pca`) and
# one fitted on the MinMax-scaled features (`pca_scaled`). Refitting a single
# instance on the scaled data would leave `pca` out of sync with
# X_train_reduced / X_valid_reduced, so a later `pca.transform(...)` on raw data
# would silently apply the projection learned from the scaled data.
pca = PCA(n_components=0.85, whiten=True)
pca_scaled = PCA(n_components=0.85, whiten=True)

# Raw features: fit on the training split, reuse the fitted projection for validation.
X_train_reduced = pca.fit_transform(X_train)
X_valid_reduced = pca.transform(X_valid)

# Scaled features: same procedure with its own PCA instance.
X_train_reduced_scaled = pca_scaled.fit_transform(X_train_scaled)
X_valid_reduced_scaled = pca_scaled.transform(X_valid_scaled)

# Compare how many components each variant retained.
print(X_train_reduced_scaled.shape, X_train_reduced.shape)

### - GridSearch ###
Exhaustive search over specified parameter values for an estimator.
We choose the following estimators: SVC, KNN, Gradient Boosting, Random Forest, and AdaBoost.
For each of them we will define a different parameter grid.

In [None]:
# Hyperparameter grids, one per classifier (same order as `classifiers` below).
#
# Only genuine hyperparameters are searched. `n_jobs` (parallelism) and
# `random_state` (seed) do not belong in a grid: searching them multiplies the
# number of fits without exploring any different model, and pollutes
# `best_params_`. They are set once on the estimators instead.
#
# The SVC grid is split in two because `gamma` is ignored by the linear kernel;
# searching it there would refit the identical model once per gamma value.
params_svc = [
    {'kernel': ('linear',), 'C': (0.01, 0.1, 1, 10)},
    {'kernel': ('rbf', 'poly'), 'gamma': (100, 10, 1, 0.1, 0.001), 'C': (0.01, 0.1, 1, 10)},
]
params_knn = {'p': (1, 2, 3), 'n_neighbors': list(range(3, 25, 2))}
params_gb = {'n_estimators': (150, 200), 'learning_rate': (0.01, 0.1, 1), 'max_depth': (3, 5, 9)}
params_rf = {'n_estimators': (150, 200), 'max_depth': (2, 4, 7), 'criterion': ('gini', 'entropy')}
params_ab = {'n_estimators': (50, 100, 150), 'learning_rate': (0.01, 0.1, 1)}

params = [params_svc, params_knn, params_gb, params_rf, params_ab]

# Estimators to tune. The stochastic ensembles get a fixed seed so that a
# Restart & Run All reproduces the same scores; KNN and random forest use all cores.
classifiers = [
    SVC(),
    KNeighborsClassifier(n_jobs=-1),
    GradientBoostingClassifier(random_state=0),
    RandomForestClassifier(n_jobs=-1, random_state=0),
    AdaBoostClassifier(random_state=0),
]

# (train, validation) feature pairs to compare, with matching display names.
data = [(X_train, X_valid), (X_train_reduced, X_valid_reduced), (X_train_reduced_scaled, X_valid_reduced_scaled)]
data_names = ['X_train', 'X_train_reduced', 'X_train_reduced_scaled']

We will run each algorithm with different parameters on the raw data, the PCA-reduced data, and the scaled-then-PCA-reduced data, and finally keep the best score.

In [None]:
# Grid-search every classifier on every data variant and keep track of
#   * the best (score, algorithm, params) per data variant, and
#   * the best (score, algorithm, data variant, params) overall.
# Scores are accuracies on the held-out validation split.
best_result_score = -1
best_result_algo = None
best_result_data = None    # name of the data variant that produced the overall best score
best_result_params = None  # hyperparameters of the overall best model

for num, d in enumerate(data):
    best_score = -1
    best_params = None
    best_algo = None
    for classifier, grid in zip(classifiers, params):
        # GridSearchCV clones the estimator, so the same instance can be reused
        # for every data variant. cv=5 is scikit-learn's documented default,
        # made explicit here.
        model = GridSearchCV(classifier, grid, scoring='accuracy', cv=5)
        model.fit(d[0], y_train)
        score = model.score(d[1], y_valid)
        param = model.best_params_

        if score > best_score:
            best_score = score
            best_algo = str(classifier)
            best_params = param

    # Promote this data variant's winner to overall winner if it beats the
    # current best. The data variant's *name* is recorded (previously the
    # hyperparameters were stored in best_result_data by mistake).
    if best_score > best_result_score:
        best_result_score = best_score
        best_result_algo = best_algo
        best_result_data = data_names[num]
        best_result_params = best_params
    print('For Data: {0} \nBest score = {1} \nBest params = {2} \nFrom Algorithm: {3} \n\n'.format(data_names[num], best_score, best_params, best_algo))
print('Finally we keep - The best score: {0} \n with algorithm: {1} \n For the data: {2} \n with params: {3}'.format(best_result_score, best_result_algo, best_result_data, best_result_params))

Now we keep the best classifier and apply it to the test data to predict the labels that we will submit.

In [None]:
# Refit the PCA on the raw training features so the projection applied to the
# test set is the one learned from the data the final model is trained on.
X_train_reduced = pca.fit_transform(X_train)
X_test_reduced = pca.transform(test)

# NOTE(review): these hyperparameters are hard-coded; presumably they were copied
# from the grid-search output above — confirm they still match after a re-run.
model = SVC(C=1, gamma=0.1, kernel='rbf')
model.fit(X_train_reduced, y_train)
y_pred = pd.Series(model.predict(X_test_reduced), name='Solution')

# Write the submission with an explicit 1-based Id column next to the predicted
# label. Writing the bare Series produced a single column headed "0".
# NOTE(review): the Id/Solution layout is the format this competition is understood
# to expect — confirm against the competition's sample submission.
submission = pd.DataFrame({'Id': np.arange(1, len(y_pred) + 1), 'Solution': y_pred})
submission.to_csv('Submission.csv', index=False)