## Data preprocessing and logistic regression for a binary classification problem

The university has provided a dataset containing 249 features, including variables that represent the size of the grant, the general area of study and de-identified information on the investigators who are applying for the grant. Participants train their models on 8,707 grant applications made between 2004 and 2008. They then make predictions on a further 2,176 applications made in 2009 and the first half of 2010.

In [None]:
import pandas as pd
import numpy as np
import matplotlib
from matplotlib import pyplot as plt
from sklearn.linear_model import LogisticRegression as LR
from sklearn.feature_extraction import DictVectorizer as DV
matplotlib.style.use('ggplot')
%matplotlib inline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the labelled training applications.
data = pd.read_csv('../input/unimelb/unimelb_training.csv')
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" to the output.
data.info()

# Load the unlabelled applications that predictions are made for.
test = pd.read_csv('../input/unimelb/unimelb_test.csv')
# Last expression of the cell: the notebook displays (rows, columns).
test.shape

In [None]:
# Stack the training rows and the test rows (training rows first) into one
# feature table so that every later preprocessing step is applied to both
# with identical columns.
#
# ignore_index must be the boolean True (the string 'True' only worked because
# it is truthy), and the target column is dropped by keyword because pandas 2.x
# no longer accepts `axis` as a positional argument of DataFrame.drop.
X = pd.concat([data, test], ignore_index=True).drop(columns='Grant.Status')
# Target labels: available for the training rows only.
y = data['Grant.Status']
X.info()

In [None]:
# Partition the feature names by dtype: `object` columns are handled as
# categorical features, every other column as a numeric feature.
numeric_cols = X.select_dtypes(exclude=['object']).columns
categorical_cols = X.select_dtypes(include=['object']).columns
# Row index of the feature table (select_dtypes only filters columns).
ind = X.index

print(numeric_cols.shape, categorical_cols.shape)

In [None]:
# Numeric features: every missing value is replaced with zero.
X_real_zeros = X.loc[:, numeric_cols].fillna(0.0)

# The string literal below is an inert, disabled alternative (mean imputation);
# it is never executed.
"""X_real_mean = X[numeric_cols] 
for col in numeric_cols:
     X_real_mean[col]=X_real_mean[col].fillna(np.mean(X_real_mean[col]))"""

# Categorical features: missing values become the label 'NA' and every value
# is cast to str so that all categories share a single type.
X_cat = X.loc[:, categorical_cols].fillna('NA').astype(str)
print(X_cat.shape, X_real_zeros.shape)

# One-hot encoding:

In [None]:
# One-hot encode the categorical columns with DictVectorizer (dense output).
encoder = DV(sparse=False)
# DictVectorizer consumes one {column: value} mapping per row.
# to_dict(orient='records') produces exactly those mappings in row order,
# without transposing the whole frame first and without collapsing rows that
# happen to share an index label.
X_cat_oh = encoder.fit_transform(X_cat.to_dict(orient='records'))

print(X_cat_oh.shape)

# Feature scaling

In [None]:
from pandas.plotting import scatter_matrix

# Pairwise scatter plots of three numeric features before any scaling.
list_cols = ['Number.of.Successful.Grant.1', 'SEO.Percentage.2', 'Year.of.Birth.1']
data_numeric = pd.DataFrame(X_real_zeros, columns=numeric_cols)
scatter_matrix(data_numeric.loc[:, list_cols], alpha=0.5, figsize=(10, 10))
plt.show()

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the zero-imputed numeric features.
# The scaler is fitted on X_real_zeros, which holds the training and the test
# rows together.
scaler = StandardScaler()
scaler.fit(X_real_zeros)
X_real_scaled = scaler.transform(X_real_zeros)

Let's plot the same scatter matrix for the scaled data:

In [None]:
# The same three features after standardisation, for comparison with the
# plot of the raw values.
list_cols = ['Number.of.Successful.Grant.1', 'SEO.Percentage.2', 'Year.of.Birth.1']
data_numeric_scaled = pd.DataFrame(X_real_scaled, columns=numeric_cols)
scatter_matrix(data_numeric_scaled.loc[:, list_cols], alpha=0.5, figsize=(10, 10))
plt.show()

### Let's separate the competition test rows from the labelled training rows:

In [None]:
# Number of labelled rows. X was built as [training rows, test rows], so the
# first n_train rows are the training data and the rest is the test set.
# Deriving the boundary from y (instead of a hard-coded row count) guarantees
# that the training slices always have exactly one row per label.
n_train = len(y)

# Test design matrix: numeric columns followed by the one-hot columns.
test_zeros = np.hstack((X_real_zeros[n_train:], X_cat_oh[n_train:]))

# Training part of each feature group.
X_real_zeros_cut = X_real_zeros[:n_train]
X_cat_oh_cut = X_cat_oh[:n_train]

# DataFrame.info() prints by itself and returns None; wrapping it in print()
# would add a stray "None" line.
X_real_zeros.info()
print(test_zeros.shape)

# Stratification 

In [None]:
from sklearn.model_selection import train_test_split

# A single stratified 70/30 split applied to the numeric block, the one-hot
# block and the labels together, so all three are partitioned by the same rows.
(
    X_train_real_zeros, X_test_real_zeros,
    X_train_cat_oh, X_test_cat_oh,
    y_train, y_test,
) = train_test_split(
    X_real_zeros_cut,
    X_cat_oh_cut,
    y,
    test_size=0.3,
    stratify=y,
    random_state=0,
)
print(X_train_real_zeros.shape)
y_train.shape

# Balancing classes


Let's take a look at the class counts in our training sample:

In [None]:
# Number of training samples per class: label 0 first, then label 1.
for label in (0, 1):
    print((y_train == label).sum())

# Feature selection using Lasso (L1-penalized logistic regression) with class_weight='balanced'

In [None]:
# Candidate values of the regularisation parameter C tried by the grid search.
param_grid = {'C': [0.01, 0.05, 0.1, 0.5, 1, 5, 10]}

# Training design matrix: numeric columns followed by the one-hot columns.
learn_zeros = np.hstack((X_train_real_zeros, X_train_cat_oh))

# L1-penalised logistic regression with class weights balanced.
estimator_lasso = LogisticRegression(
    penalty='l1',
    solver='liblinear',
    class_weight='balanced',
)

# 3-fold cross-validated search over C, using all available cores.
# NOTE(review): no `scoring` is passed, so the search presumably ranks
# candidates by the estimator's default score rather than ROC AUC — confirm
# this is intended.
optimizer_zeros = GridSearchCV(estimator_lasso, param_grid, cv=3, n_jobs=-1)
optimizer_zeros.fit(learn_zeros, y_train)

print('score_lasso', optimizer_zeros.best_score_)
print('param _lasso', optimizer_zeros.best_params_)

In [None]:
def plot_scores(optimizer):
    """Plot the mean cross-validation score against C for a fitted search.

    Draws the mean test score on a logarithmic C axis with a shaded band of
    one standard deviation on either side, shows the figure and then prints
    the best parameter set.

    Args:
        optimizer: fitted search object exposing ``cv_results_`` (with
            ``params`` entries that contain ``'C'``, plus
            ``mean_test_score`` and ``std_test_score``) and ``best_params_``.
    """
    results = optimizer.cv_results_
    # One row per candidate: [C, mean test score, std of test score].
    table = np.array([
        [candidate['C'], mean, std]
        for candidate, mean, std in zip(
            results['params'],
            results['mean_test_score'],
            results['std_test_score'],
        )
    ])
    c_values = table[:, 0]
    mean_score = table[:, 1]
    spread = table[:, 2]

    plt.semilogx(c_values, mean_score)
    plt.fill_between(c_values, mean_score - spread, mean_score + spread, alpha=0.3)
    plt.show()
    print('param _zeros_Smb', optimizer.best_params_)

# Score-versus-C curve for the search run on the zero-imputed features.
plot_scores(optimizer_zeros)

# Coefficients of the best estimator found by the search.
best_lasso = optimizer_zeros.best_estimator_
print(best_lasso.coef_)

In [None]:
# Predicted probability that each test application is successful.
# predict_proba orders its columns like best_estimator_.classes_, so with 0/1
# labels column 0 is P(Grant.Status == 0) and column 1 is P(Grant.Status == 1).
# The value written to the 'Grant.Status' submission column must be the
# probability of class 1, so column 1 is selected (column 0 would submit the
# probability of failure).
Y = optimizer_zeros.best_estimator_.predict_proba(test_zeros)[:, 1]

In [None]:
print(Y)
# Use the example submission as a template for the answer file.
y_ans = pd.read_csv('../input/unimelb/unimelb_example.csv')
# info is a method: it has to be called (it prints its own summary), otherwise
# only the bound-method repr is printed.
y_ans.info()
# Overwrite the template's predictions in place, which keeps the column where
# the template has it.
y_ans['Grant.Status'] = Y
# Show the finished submission table (not the training labels `y`).
print(y_ans)

In [None]:
# Write the submission file. index=False keeps the DataFrame's positional row
# index out of the file, so it contains only the columns of y_ans.
y_ans.to_csv('./GrantStatus_answer.csv', index=False)