In [1]:
import time
import re
import numpy as np
import pandas as pd
import warnings;warnings.filterwarnings('ignore')
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB,MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder

In [2]:
# Load the train and test splits (train first, then test); both files are
# read with '\n' as the line terminator.
df_train, df_test = (
    pd.read_csv(csv_path, lineterminator='\n')
    for csv_path in ('../input/train.csv', '../input/test.csv')
)

In [3]:
# Encode the sentiment labels as integers (Negative -> 0, Positive -> 1).
# Values that are not keys of the mapping are passed through unchanged, so
# re-running this cell leaves an already-encoded column intact instead of
# turning every label into NaN. An unexpected label stays visible as-is
# rather than silently becoming NaN.
label_to_int = {'Negative':0,'Positive':1}
df_train['label'] = df_train['label'].map(lambda raw: label_to_int.get(raw, raw))
df_train.head()

Unnamed: 0,ID,review,label
0,1,Jo bhi ap se tou behtar hoon,0
1,2,ya Allah meri sister Affia ki madad farma,1
2,3,Yeh khud chahta a is umar main shadi krna. ha...,0
3,4,Tc ? Apky mun xe exe alfax achy nae lgty 😒💃,0
4,5,Good,1


In [4]:
# Ordered (pattern, replacement) rules applied by `cleaner`. The order matters:
# every rule operates on the output of the rules before it.
_CLEANER_RULES = (
  (r'\#\.', ''),     # the two-character sequence "#."
  (r'\n', ''),       # newlines
  (r',', ''),        # commas
  (r'\-', ' '),      # hyphen -> space
  (r'\.', ''),       # dots
  (r'\\', ' '),      # backslash -> space
  # NOTE(review): this rule can never match here, because the two rules above
  # have already removed every dot and every backslash. Kept to preserve the
  # original rule list.
  (r'\\x\.+', ''),
  (r'\d', ''),       # digits
  (r'^_.', ''),      # a leading underscore together with the next character
  (r'_', ' '),       # remaining underscores -> space
  (r'^ ', ''),       # one leading space
  (r' $', ''),       # one trailing space
  (r'\?', ''),       # question marks
)

def cleaner(word):
  """Apply the `_CLEANER_RULES` substitutions in order and lower-case the result.

  Parameters
  ----------
  word : str
      Text fragment to clean.

  Returns
  -------
  str
      The cleaned, lower-cased fragment.
  """
  cleaned = word
  for pattern, replacement in _CLEANER_RULES:
    cleaned = re.sub(pattern, replacement, cleaned)
  return cleaned.lower()

def array_cleaner(array):
  """Clean every sentence in `array` and return the results as a list.

  Each sentence is split on single spaces, every piece is passed through
  `cleaner`, and the cleaned pieces are concatenated with one space in front
  of each piece (so every cleaned sentence starts with a space).

  Parameters
  ----------
  array : iterable of str
      Sentences to clean.

  Returns
  -------
  list of str
      One cleaned sentence per input sentence, in the same order.
  """
  return [
    ''.join(' ' + cleaner(piece) for piece in sentence.split(' '))
    for sentence in array
  ]

In [5]:
# Clean the review text of both splits and keep the training labels.
X_train = array_cleaner(df_train['review'])
X_test = array_cleaner(df_test['review'])
y_train = df_train['label']

# Sanity check, one value per line: cleaned train reviews, cleaned test
# reviews, train labels.
print(len(X_train), len(X_test), len(y_train), sep='\n')

6328
2712
6328


In [6]:
# Labels as an int8 NumPy array.
y_train = np.array(y_train).astype('int8')

# Remember where the training rows end, then combine both splits so the
# tokenizer can be fitted on all of the text.
lentrain = len(X_train)
X_all = X_train + X_test

In [7]:
# TF-IDF configuration: n-grams of length 1 up to `ngram`, sublinear term
# frequency scaling, and max_df=0.5.
ngram = 2
vectorizer = TfidfVectorizer(
    ngram_range=(1, ngram),
    max_df=0.5,
    sublinear_tf=True,
)

In [8]:
# Fit the vectorizer and vectorize the combined corpus in a single pass
# instead of processing the text once in fit() and again in transform().
# NOTE: X_all is rebound from the list of cleaned strings to the TF-IDF
# matrix, so re-create X_all (the cell that concatenates X_train and X_test)
# before re-running this cell.
X_all = vectorizer.fit_transform(X_all) # This is the slow part!

In [9]:
# Separate back into training and test sets: the first `lentrain` rows are
# the training reviews, the remaining rows are the test reviews.
X_train_chuli, X_test_chuli = X_all[:lentrain], X_all[lentrain:]

In [10]:
import lightgbm as lgb
from sklearn.model_selection import KFold,StratifiedKFold
from bayes_opt import BayesianOptimization
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [11]:
def LGB_CV(min_data_in_leaf,feature_fraction,bagging_fraction,):
    """Objective for the Bayesian search: 5-fold out-of-fold AUC of LightGBM.

    Trains one binary LightGBM model per fold on the notebook-level
    ``X_train_chuli`` / ``y_train`` and scores the combined out-of-fold
    predictions.

    Parameters
    ----------
    min_data_in_leaf : float
        Passed to LightGBM as ``min_data_in_leaf`` after truncation to ``int``.
    feature_fraction : float
        Passed to LightGBM as ``feature_fraction``.
    bagging_fraction : float
        Passed to LightGBM as ``bagging_fraction``.

    Returns
    -------
    float
        ROC AUC of the out-of-fold predictions against ``y_train``.
    """
    # The parameters do not depend on the fold, so build them once.
    param = {
        'max_depth': -1,
        'min_data_in_leaf': int(min_data_in_leaf),
        'objective':'binary',
        'bagging_fraction':bagging_fraction,
        'feature_fraction':feature_fraction,
        'learning_rate': 0.005,
        "boosting": "gbdt",
        "bagging_freq": 5,
        "bagging_seed": 11,
        "metric": 'auc',
        "verbosity": -1
    }

    folds = KFold(n_splits=5, shuffle=True, random_state=2019)
    oof = np.zeros(X_train_chuli.shape[0])

    for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train_chuli, y_train)):
        print("fold n°{}".format(fold_))
        trn_data = lgb.Dataset(X_train_chuli[trn_idx], label=y_train[trn_idx])
        val_data = lgb.Dataset(X_train_chuli[val_idx], label=y_train[val_idx])

        # Current LightGBM releases no longer accept the `early_stopping_rounds`
        # / `verbose_eval` keywords of lgb.train and expect callbacks instead;
        # releases without `log_evaluation` still need the keyword form.
        # The callbacks are created per fold so no state is shared across folds.
        if hasattr(lgb, 'log_evaluation'):
            stop_and_log = {'callbacks': [lgb.early_stopping(500),
                                          lgb.log_evaluation(500)]}
        else:
            stop_and_log = {'early_stopping_rounds': 500, 'verbose_eval': 500}

        clf = lgb.train(param,
                        trn_data,
                        8000,
                        valid_sets = [trn_data, val_data],
                        **stop_and_log)

        # Score the held-out rows with the best iteration found for this fold.
        oof[val_idx] = clf.predict(X_train_chuli[val_idx],
                                   num_iteration=clf.best_iteration)

    return metrics.roc_auc_score(y_train,oof)

In [12]:
# Search space for the three tuned LightGBM parameters. The optimiser samples
# its points randomly, so it is seeded to make the search reproducible across
# kernel restarts.
LGB_BO = BayesianOptimization(
    LGB_CV,
    {
        'min_data_in_leaf': (2, 40),
        'bagging_fraction': (0.01, 0.999),
        'feature_fraction':(0.01, 0.999)
    },
    random_state=2019,
)

In [13]:
# Run the search with 2 initial points followed by 2 optimisation iterations.
# Every evaluated point calls LGB_CV once.
LGB_BO.maximize(
    init_points=2,
    n_iter=2,
)

|   iter    |  target   | baggin... | featur... | min_da... |
-------------------------------------------------------------
fold n°0
Training until validation scores don't improve for 500 rounds.
[500]	training's auc: 0.828078	valid_1's auc: 0.757608
[1000]	training's auc: 0.857157	valid_1's auc: 0.764559
[1500]	training's auc: 0.878674	valid_1's auc: 0.763058
Early stopping, best iteration is:
[1030]	training's auc: 0.858527	valid_1's auc: 0.764781
fold n°1
Training until validation scores don't improve for 500 rounds.
[500]	training's auc: 0.832002	valid_1's auc: 0.746197
[1000]	training's auc: 0.859313	valid_1's auc: 0.74833
Early stopping, best iteration is:
[999]	training's auc: 0.859298	valid_1's auc: 0.748365
fold n°2
Training until validation scores don't improve for 500 rounds.
[500]	training's auc: 0.830668	valid_1's auc: 0.724399
[1000]	training's auc: 0.859244	valid_1's auc: 0.736613
[1500]	training's auc: 0.880154	valid_1's auc: 0.737965
[2000]	training's auc: 0.897413	val

In [14]:
# Final model: 5-fold cross-validation with fixed hyper-parameters. Test
# predictions are the average of the per-fold models' predictions.

# The parameters do not depend on the fold, so build them once.
param = {
    'max_depth': -1,
    'min_data_in_leaf': 16,
    'objective':'binary',
    'bagging_fraction':0.999,
    'feature_fraction':0.999,
    'learning_rate': 0.005,
    "boosting": "gbdt",
    "bagging_freq": 5,
    "bagging_seed": 11,
    "metric": 'auc',
    "verbosity": -1
}

folds = KFold(n_splits=5, shuffle=True, random_state=2019)
oof = np.zeros(X_train_chuli.shape[0])
predictions = np.zeros(X_test_chuli.shape[0])
for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train_chuli, y_train)):
    print("fold n°{}".format(fold_))
    trn_data = lgb.Dataset(X_train_chuli[trn_idx], label=y_train[trn_idx])
    val_data = lgb.Dataset(X_train_chuli[val_idx], label=y_train[val_idx])

    # Current LightGBM releases no longer accept the `early_stopping_rounds`
    # / `verbose_eval` keywords of lgb.train and expect callbacks instead;
    # releases without `log_evaluation` still need the keyword form.
    # The callbacks are created per fold so no state is shared across folds.
    if hasattr(lgb, 'log_evaluation'):
        stop_and_log = {'callbacks': [lgb.early_stopping(500),
                                      lgb.log_evaluation(500)]}
    else:
        stop_and_log = {'early_stopping_rounds': 500, 'verbose_eval': 500}

    clf = lgb.train(param,
                    trn_data,
                    8000,
                    valid_sets = [trn_data, val_data],
                    **stop_and_log)

    # Out-of-fold predictions for the held-out rows of this fold.
    oof[val_idx] = clf.predict(X_train_chuli[val_idx],
                               num_iteration=clf.best_iteration)
    # Each fold contributes an equal share to the averaged test prediction.
    predictions += clf.predict(X_test_chuli, num_iteration=clf.best_iteration) / folds.n_splits

# Report the cross-validated score of the out-of-fold predictions.
print("CV AUC: {:.6f}".format(metrics.roc_auc_score(y_train, oof)))

fold n°0
Training until validation scores don't improve for 500 rounds.
[500]	training's auc: 0.861537	valid_1's auc: 0.782503
[1000]	training's auc: 0.901224	valid_1's auc: 0.785904
[1500]	training's auc: 0.923954	valid_1's auc: 0.784351
Early stopping, best iteration is:
[1175]	training's auc: 0.910107	valid_1's auc: 0.786667
fold n°1
Training until validation scores don't improve for 500 rounds.
[500]	training's auc: 0.861512	valid_1's auc: 0.779871
[1000]	training's auc: 0.901472	valid_1's auc: 0.787653
[1500]	training's auc: 0.925797	valid_1's auc: 0.788059
Early stopping, best iteration is:
[1494]	training's auc: 0.925569	valid_1's auc: 0.788155
fold n°2
Training until validation scores don't improve for 500 rounds.
[500]	training's auc: 0.864899	valid_1's auc: 0.757766
[1000]	training's auc: 0.90292	valid_1's auc: 0.77437
[1500]	training's auc: 0.92591	valid_1's auc: 0.777918
[2000]	training's auc: 0.941596	valid_1's auc: 0.780061
[2500]	training's auc: 0.953297	valid_1's auc: 0

In [15]:
# Number of test predictions, followed by the first four values.
print(predictions.shape[0])
predictions[:4]

2712


array([0.42070474, 0.6040751 , 0.97468436, 0.71787887])

In [16]:
# Build the submission table (test ID plus predicted score) and write it to
# disk without the index column.
lgb_output = pd.DataFrame(
    {
        "ID": df_test["ID"],
        "Pred": predictions,
    }
)
lgb_output.to_csv('lgb_new.csv', index=False)