In [1]:
import time
import re
import numpy as np
import pandas as pd
import warnings;warnings.filterwarnings('ignore')
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder

In [2]:
# Load the training and test review tables from the ../input directory.
# lineterminator='\n' tells the parser to treat only '\n' as the end of a row
# (presumably so stray '\r' characters inside reviews don't split rows — TODO confirm).
df_train = pd.read_csv('../input/train.csv',lineterminator='\n')
df_test = pd.read_csv('../input/test.csv',lineterminator='\n')

In [3]:
# Encode the string sentiment labels as integers: Negative -> 0, Positive -> 1.
# The lookup falls back to the value itself, so re-running this cell on
# already-encoded labels is a no-op instead of turning every label into NaN
# (which is what a plain `.map(dict)` does the second time it runs).
# Unexpected label strings are left untouched so a later integer cast fails
# loudly rather than silently producing NaN.
label_map = {'Negative':0,'Positive':1}
df_train['label'] = df_train['label'].map(lambda value: label_map.get(value, value))
df_train.head()

Unnamed: 0,ID,review,label
0,1,Jo bhi ap se tou behtar hoon,0
1,2,ya Allah meri sister Affia ki madad farma,1
2,3,Yeh khud chahta a is umar main shadi krna. ha...,0
3,4,Tc ? Apky mun xe exe alfax achy nae lgty 😒💃,0
4,5,Good,1


In [4]:
def cleaner(word):
  """Strip punctuation/digits from a single token and return it lower-cased.

  The regex substitutions are applied one after another in the order listed
  below; the order matters because later steps see the output of earlier ones
  (e.g. '-' and '\\' become spaces before leading/trailing spaces are trimmed).

  Args:
    word: the text fragment to clean (a string).

  Returns:
    The cleaned, lower-cased string. It may be empty and may still contain
    interior spaces introduced by the '-', '\\' and '_' replacements.
  """
  substitutions = (
    (r'\#\.', ''),    # the literal two-character sequence "#."
    (r'\n', ''),      # newlines
    (r',', ''),       # commas
    (r'\-', ' '),     # hyphens become spaces
    (r'\.', ''),      # dots
    (r'\\', ' '),     # backslashes become spaces
    (r'\\x\.+', ''),  # can never match: no backslash or dot survives the steps above
    (r'\d', ''),      # digits
    (r'^_.', ''),     # a leading underscore together with the character after it
    (r'_', ' '),      # remaining underscores become spaces
    (r'^ ', ''),      # one leading space
    (r' $', ''),      # one trailing space
    (r'\?', ''),      # question marks
  )
  cleaned = word
  for pattern, replacement in substitutions:
    cleaned = re.sub(pattern, replacement, cleaned)
  return cleaned.lower()

def array_cleaner(array):
  """Clean every sentence in `array` and return the results as a new list.

  Each sentence is split on single spaces, every piece is passed through
  `cleaner`, and the cleaned pieces are concatenated with one space placed in
  front of each piece. Every returned string therefore starts with a space.

  Args:
    array: an iterable of strings (each item must support `.split`).

  Returns:
    A list with one cleaned string per input sentence, in the same order.
  """
  return [
    ''.join(' ' + cleaner(token) for token in sentence.split(' '))
    for sentence in array
  ]

In [5]:
# Labels for the training reviews, taken straight from the 'label' column.
y_train = df_train['label']

# Run the text cleaner over the raw 'review' column of each split; the results
# are plain Python lists of cleaned strings.
X_train = array_cleaner(df_train['review'])
X_test = array_cleaner(df_test['review'])

# Sanity check: sizes of the cleaned train set, the cleaned test set and the
# labels, one per line.
print(len(X_train), len(X_test), len(y_train), sep='\n')

6328
2712
6328


In [6]:
# Turn the labels into an int8 NumPy array so they can be indexed with the
# integer index arrays produced by the cross-validation splitter.
y_train = np.array(y_train).astype('int8')

# Remember where the training rows end, then combine both splits into one
# list (train first, test second) to fit the tokenizer on all of the text.
lentrain = len(X_train)
X_all = X_train + X_test

In [7]:
# TF-IDF settings: n-grams of length 1 up to `ngram`, sublinear_tf=True
# (term frequency is replaced by 1 + log(tf)) and max_df=0.5 (terms that occur
# in more than half of the documents are ignored).
ngram = 2
vectorizer = TfidfVectorizer(sublinear_tf=True,ngram_range=(1, ngram), max_df=0.5)

In [8]:
# This is the slow part!
# Learn the vocabulary and IDF weights from the combined train + test text and
# vectorize it in a single pass: fit_transform() yields the same matrix as
# fit() followed by transform() but only tokenizes the corpus once.
# NOTE: X_all is rebound from a list of strings to the TF-IDF matrix here, so
# re-run the cell that builds X_all before re-running this one.
X_all = vectorizer.fit_transform(X_all)

In [9]:
# Peek at the last five entries of the learned feature (n-gram) list.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so use get_feature_names_out() when it exists and fall back otherwise.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
list(feature_names[-5:])

['賭easar ul', '鄭h', '鄭h isnan', '鄭pwa', '鄭pwa yani']

In [10]:
# (number of documents, number of TF-IDF features) for the combined train + test matrix.
X_all.shape

(9040, 113521)

In [11]:
# The combined matrix was built as train rows followed by test rows, so the
# first `lentrain` rows are the training documents and the rest are the test
# documents. ("chuli" presumably means "processed" — TODO confirm.)
X_train_chuli = X_all[:lentrain] # Separate back into training and test sets. 
X_test_chuli = X_all[lentrain:]
X_train_chuli.shape

(6328, 113521)

In [12]:
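# Disabled experiment: standardize the TF-IDF features (scaler fitted on the
# training rows only, with_mean=False so no centering is applied) before modelling.
# from sklearn.preprocessing import StandardScaler
# sac = StandardScaler(with_mean=False)
# sac.fit(X_train_chuli)
# X_train_chuli = sac.transform(X_train_chuli)
# X_test_chuli = sac.transform(X_test_chuli)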

In [13]:
from sklearn.model_selection import KFold,StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.linear_model import SGDClassifier as SGD

In [14]:
import sklearn

# SGDClassifier's logistic loss was renamed from 'log' to 'log_loss' in
# scikit-learn 1.1 and the old spelling was removed in 1.3, so pick whichever
# name the installed version accepts.
_sklearn_version = tuple(
    int(part) for part in re.match(r'(\d+)\.(\d+)', sklearn.__version__).groups()
)
LOG_LOSS = 'log_loss' if _sklearn_version >= (1, 1) else 'log'

# 10-fold stratified CV without shuffling. random_state is deliberately not
# passed: it has no effect when shuffle=False, and scikit-learn >= 0.24 raises
# a ValueError for that combination. The fold assignment is unchanged.
folds = StratifiedKFold(n_splits=10, shuffle=False)
# Out-of-fold probability of label 1 for every training row.
oof = np.zeros(X_train_chuli.shape[0])
# Probability of label 1 for every test row, averaged over the fold models.
predictions = np.zeros(X_test_chuli.shape[0])

for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train_chuli, y_train)):
    print("Fold :{}".format(fold_ + 1))
    trn_data = X_train_chuli[trn_idx]
    trn_label= y_train[trn_idx]
    val_data = X_train_chuli[val_idx]
    val_label= y_train[val_idx]
    # Logistic regression trained with SGD; the seed fixes the per-epoch shuffling.
    model_SGD = SGD(alpha=0.00001,random_state = 2019, shuffle = True, loss = LOG_LOSS)
    model_SGD.fit(trn_data, trn_label) # Fit the model.
    # Store the held-out predictions so an overall CV score can be computed below.
    oof[val_idx] = model_SGD.predict_proba(val_data)[:,1]
    print("auc score: {:<8.5f}".format(metrics.roc_auc_score(val_label, oof[val_idx])))
    # Each fold model contributes 1/n_splits of the final test prediction.
    predictions += model_SGD.predict_proba(X_test_chuli)[:,1] / folds.n_splits

# AUC over all out-of-fold predictions (every training row is scored exactly once).
print("CV auc score: {:<8.5f}".format(metrics.roc_auc_score(y_train, oof)))

Fold :1
auc score: 0.87024 
Fold :2
auc score: 0.84706 
Fold :3
auc score: 0.84815 
Fold :4
auc score: 0.86126 
Fold :5
auc score: 0.85258 
Fold :6
auc score: 0.82464 
Fold :7
auc score: 0.87670 
Fold :8
auc score: 0.83691 
Fold :9
auc score: 0.85077 
Fold :10
auc score: 0.86930 


In [15]:
# Sanity check: number of test predictions, then a peek at the first four values.
print(len(predictions))
predictions[:4]

2712


array([0.20716827, 0.5100793 , 0.90918812, 0.74439141])

In [16]:
# Build the submission table: the test set's ID column next to the values in
# `predictions`, then write it to SGD_new.csv without the DataFrame index.
SGD_output = pd.DataFrame({"ID":df_test["ID"], "Pred":predictions})
SGD_output.to_csv('SGD_new.csv', index = False)