# Machine Learning Playground Practice

In [None]:
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt

In [None]:
# Read the competition training set into a DataFrame, then show its
# (rows, columns) shape as the cell output.
train_data = pd.read_csv(
    filepath_or_buffer='../input/tabular-playground-series-nov-2021/train.csv',
    low_memory=False,
)
train_data.shape

In [None]:
# Preview the first five rows of the training frame.
train_data.head(n=5)

In [None]:
# Print the summary produced by DataFrame.info() for the training frame.
train_data.info()

In [None]:
# Count the missing values in every column.
train_data.isnull().sum()

In [None]:
# Show how many rows fall into each value of the `target` column.
train_data['target'].value_counts()

In [None]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier 
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

# Seed NumPy's global random number generator.
np.random.seed(42)

# Candidate baseline classifiers, keyed by the label used when reporting scores.
models = {
    'RandomForestClassifier': RandomForestClassifier(n_jobs=-1, random_state=42),
    'SGDClassifier': SGDClassifier(n_jobs=-1, random_state=42),
    'LogisticRegression': LogisticRegression(solver='liblinear', random_state=42, dual=False),
    'LinearSVC': LinearSVC(dual=False),
    'GradientBoostingClassifier': GradientBoostingClassifier(),
}

# Features are every column except `id` and `target`; `target` is the label.
X = train_data.drop(columns=['id', 'target'])
y = train_data['target']

# Hold out 20% of the rows for validation.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)

# Keep only the first 20,000 rows of each part.
X_train, y_train = X_train.iloc[:20000], y_train.iloc[:20000]
X_val, y_val = X_val.iloc[:20000], y_val.iloc[:20000]

def model_fit_score(models,X_train,X_val,y_train,y_val):
    """Fit every model on the training data and score it on the validation data.

    Parameters
    ----------
    models : dict
        Maps a label to an estimator exposing ``fit(X, y)`` and ``score(X, y)``.
        The estimators are fitted in place, in the dict's iteration order.
    X_train, y_train
        Training features and labels passed to ``fit``.
    X_val, y_val
        Validation features and labels passed to ``score``.

    Returns
    -------
    dict
        Maps each label in ``models`` to the value returned by that
        estimator's ``score`` call.
    """
    def _fit_then_score(estimator):
        # Fit first, then evaluate; the return value of fit() is not used.
        estimator.fit(X_train, y_train)
        return estimator.score(X_val, y_val)

    return {name: _fit_then_score(estimator) for name, estimator in models.items()}

In [None]:
# Fit and score every candidate, then display the label -> score mapping.
model_scores = model_fit_score(models, X_train, X_val, y_train, y_val)
model_scores

In [None]:
# One-row frame of scores (one column per model), transposed so that each
# model becomes its own bar; tick labels are kept horizontal.
model_compare = pd.DataFrame(data=model_scores, index=['accuracy'])
model_compare.transpose().plot(kind='bar')
plt.xticks(rotation=0);

In [None]:
import warnings

# Silence every warning category for the remainder of the session.
warnings.simplefilter('ignore')

In [None]:
# Search spaces for the two linear models. Each grid holds fewer
# combinations (3 * 30 = 90 and 2 * 30 = 60) than the n_iter=100 requested below.
lg_grid = {'solver': ['liblinear', 'sag', 'saga'],
           'C': np.logspace(-4, 4, 30)}
# LinearSVC spells this loss 'squared_hinge' (underscore). The hyphenated
# spelling is not an accepted value, so every candidate using it failed to fit.
lsvm_grid = {'loss': ['hinge', 'squared_hinge'],
             'C': np.logspace(-4, 4, 30)}

# random_state is pinned on both the estimators and the searches so that
# re-running this cell reproduces the same candidates and fits, in line with
# the seed (42) used elsewhere in the notebook.
lg_model = RandomizedSearchCV(estimator=LogisticRegression(random_state=42),
                              param_distributions=lg_grid,
                              cv=5,
                              n_iter=100,
                              verbose=True,
                              random_state=42)

lsvm_model = RandomizedSearchCV(estimator=LinearSVC(random_state=42),
                                param_distributions=lsvm_grid,
                                cv=5,
                                n_iter=100,
                                verbose=True,
                                random_state=42)

# Run both 5-fold searches on the training subset.
lg_model.fit(X_train, y_train)
lsvm_model.fit(X_train, y_train)

In [None]:
# Validation score of each search, as a (logistic regression, linear SVM) tuple.
tuple(search.score(X_val, y_val) for search in (lg_model, lsvm_model))

In [None]:
# Display the parameter combination selected by the LinearSVC search.
lsvm_model.best_params_

In [None]:
# Re-split the full dataset (75% train / 25% validation). The seed is fixed so
# the partition, and therefore the reported accuracy, does not depend on how
# much of the global RNG stream earlier cells consumed or on how many times
# this cell is re-run.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.25, random_state=42)

# NOTE(review): loss and C look like values copied from an earlier
# lsvm_model.best_params_ output — confirm they still match the search result.
model = LinearSVC(random_state=42, loss='hinge', C=0.001268)
model.fit(X_train, y_train)

print(f'The complete accuracy obtained is {model.score(X_val,y_val)*100:.2f} %')




In [None]:
# `plot_roc_curve` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer RocCurveDisplay.from_estimator and fall back to the old helper only
# on releases that predate it.
try:
    from sklearn.metrics import RocCurveDisplay
    _plot_roc = RocCurveDisplay.from_estimator
except (ImportError, AttributeError):
    from sklearn.metrics import plot_roc_curve as _plot_roc

# ROC curve of the fitted model on the validation split.
_plot_roc(model, X_val, y_val);

In [None]:
# Read the competition test set and preview its first rows.
test_data = pd.read_csv(
    filepath_or_buffer='../input/tabular-playground-series-nov-2021/test.csv',
    low_memory=False,
)
test_data.head()

In [None]:
# Test features: everything except the `id` column.
X_test = test_data.drop(columns=['id'])
X_test.head()

In [None]:
# Predict a label for every test row with the fitted `model`; the bare name
# on the last line displays the predictions as the cell output.
preds = model.predict(X_test)
preds

In [None]:
# Pair each test `id` with its predicted `target` and write the result to
# CSV without the index column.
df = pd.DataFrame(data={'id': test_data['id'], 'target': preds})
df.to_csv(path_or_buf='TP_Nov2021.csv', index=False)
