In [29]:
import pandas as pd
import numpy as np
import sklearn
import sklearn.model_selection
import sklearn.pipeline
import sklearn.feature_extraction
import sklearn.linear_model
from sklearn.feature_selection import SelectKBest, VarianceThreshold, chi2
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

In [23]:
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")
print("Train shape : ",train_df.shape)
print("Test shape : ",test_df.shape)

## split to train and val
train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=0)


## fill up the missing values
X_train = train_df["question_text"].fillna("_##_").values
X_val = val_df["question_text"].fillna("_##_").values
X_test = test_df["question_text"].fillna("_##_").values


## Get the target values
y_train = train_df['target'].values
y_val = val_df['target'].values

Train shape :  (1306122, 3)
Test shape :  (375806, 2)


In [24]:
count_vectorizer = sklearn.feature_extraction.text.CountVectorizer(ngram_range=(1,2), stop_words='english')
feature_selector = SelectKBest(chi2, k = 700000)
logistic = sklearn.linear_model.LogisticRegression(C=0.1)

model_pipe_1 = sklearn.pipeline.Pipeline([("count_vectorizer", count_vectorizer),
                                          ("feature_selector", feature_selector),
                                          ("logisticregression", logistic)],
                                         )# memory='/Users/Shared/sklearn_mem/')

In [25]:
model_pipe_1.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('count_vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words='englis...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [28]:
acc_train = np.mean(model_pipe_1.predict(X_train) == y_train)
acc_val = np.mean(model_pipe_1.predict(X_val) == y_val)
print("acc_train={} acc_val={}".format(acc_train, acc_val))

acc_train=0.9578695316380467 acc_val=0.9520566561393434


Cross Validation

In [37]:
# hyperparameters = { 'features__text__tfidf__max_df': [0.9, 0.95],
#                     'features__text__tfidf__ngram_range': [(1,1), (1,2)],
#                   }
# clf = GridSearchCV(model_pipe_1, hyperparameters, cv=5)
 
# # Fit and tune model
# clf.fit(X_train, y_train)

n_features = X_train.shape[0]
min_k = n_features//2
max_k = n_features


possible_K = [int(x)-1 for x in np.linspace(min_k, max_k,10)]
parameteres = {'feature_selector__k':possible_K, 'logisticregression__C':[0.1,0.01,0.001]}

grid = sklearn.model_selection.RandomizedSearchCV(model_pipe_1,
                                                   param_distributions=parameteres, 
                                                   cv=3,
                                                   n_iter=2,
                                                   n_jobs=1)

In [38]:
grid.fit(X_train, y_train)

RandomizedSearchCV(cv=3, error_score='raise',
          estimator=Pipeline(memory=None,
     steps=[('count_vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words='englis...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
          fit_params=None, iid=True, n_iter=2, n_jobs=1,
          param_distributions={'feature_selector__k': [522447, 580496, 638546, 696596, 754646, 812696, 870746, 928796, 986846, 1044896], 'logisticregression__C': [0.1, 0.01, 0.001]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring=None, verbose=0)

In [39]:
grid.best_params_

{'logisticregression__C': 0.1, 'feature_selector__k': 1044896}

In [40]:
acc_train = np.mean(grid.predict(X_train) == y_train)
acc_val = np.mean(grid.predict(X_val) == y_val)
print("acc_train={} acc_val={}".format(acc_train, acc_val))

acc_train=0.9582341608790149 acc_val=0.9520566561393434


In [41]:
results = grid.predict(X_test)

In [54]:
d = {'qid': test_df['qid']}
df = pd.DataFrame(data=d)
df['prediction'] = results
df.to_csv('my_submission.csv', index=False)


In [52]:
df

Unnamed: 0,qid,prediction
0,0000163e3ea7c7a74cd7,1
1,00002bd4fb5d505b9161,0
2,00007756b4a147d2b0b3,0
3,000086e4b7e1c7146103,0
4,0000c4c3fbe8785a3090,0
5,000101884c19f3515c1a,0
6,00010f62537781f44a47,0
7,00012afbd27452239059,0
8,00014894849d00ba98a9,0
9,000156468431f09b3cae,0
