## Importing all necessary libraries

In [None]:
# Turn off Jedi-based tab completion in the IPython completer.
%config Completer.use_jedi = False
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
import sklearn

# Path of the tab-separated training file read in the "Loading Data" cell.
train_path = '../input/stumbleupon/train.tsv'
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides pandas/scikit-learn deprecation and
# convergence warnings raised by later cells.
warnings.filterwarnings("ignore")

## Loading Data

In [None]:
# Read the tab-separated training file into a DataFrame.
data=pd.read_csv(train_path,sep='\t')
# Keep only the columns used for modelling.
# NOTE(review): `new_data` is reassigned from `transform_data(data)` in a
# later cell, so this three-column selection is not what the model sees.
new_data = data[['url','boilerplate','label']]

## Preprocessing Methods

In [None]:
def processing(text):
    """Decode a JSON document held in ``text`` and return the parsed object."""
    return json.loads(text)

def title_fn(dic):
    """Return the ``'title'`` entry of ``dic``, or a placeholder when absent.

    Parameters
    ----------
    dic : dict
        Parsed boilerplate record.

    Returns
    -------
    The stored title, or ``"unknown_title"`` when the key is missing or its
    value is ``None``. Other falsy values (e.g. ``""``) are returned as-is.
    """
    text = dic.get('title')
    # Identity comparison is the correct way to test for None.
    return text if text is not None else "unknown_title"
        
def body_fn(dic):
    """Return the ``'body'`` entry of ``dic``, or a placeholder when absent.

    Parameters
    ----------
    dic : dict
        Parsed boilerplate record.

    Returns
    -------
    The stored body, or ``"unknown_body"`` when the key is missing or its
    value is ``None``. Other falsy values (e.g. ``""``) are returned as-is.
    """
    text = dic.get('body')
    # Identity comparison is the correct way to test for None.
    return text if text is not None else "unknown_body"
    
def url_fn(dic):
    """Return the ``'url'`` entry of ``dic``, or a placeholder when absent.

    Parameters
    ----------
    dic : dict
        Parsed boilerplate record.

    Returns
    -------
    The stored url, or ``"unknown_url"`` when the key is missing or its
    value is ``None``. Other falsy values (e.g. ``""``) are returned as-is.
    """
    text = dic.get('url')
    # One None check covers both a missing key and an explicit null value,
    # so no default is needed on .get().
    return text if text is not None else "unknown_url"
    

## Fetching and concatenating URL, Title, Body 

In [None]:
def transform_data(new_data):
    """Parse the JSON ``boilerplate`` column and add title/body/url columns.

    Parameters
    ----------
    new_data : pandas.DataFrame
        Frame with a ``boilerplate`` column holding JSON strings.

    Returns
    -------
    pandas.DataFrame
        A copy of the input in which ``boilerplate`` holds the parsed
        objects, plus new ``title``, ``Body`` and ``b_url`` columns.
    """
    # Work on a copy: mutating the caller's frame replaced the raw JSON
    # strings with dicts, so running this cell a second time failed inside
    # json.loads.
    new_data = new_data.copy()
    # Bracket assignment instead of attribute assignment, which is fragile
    # for DataFrame columns.
    new_data['boilerplate'] = new_data['boilerplate'].apply(processing)
    new_data['title'] = new_data['boilerplate'].apply(title_fn)
    new_data['Body'] = new_data['boilerplate'].apply(body_fn)
    new_data['b_url'] = new_data['boilerplate'].apply(url_fn)

    return new_data

# Parse the boilerplate JSON of the training frame into title/Body/b_url columns.
new_data = transform_data(data)
# Concatenate the three text fields into one document per row.
# NOTE(review): the fields are joined with no separator, so (assuming they
# are strings) the last word of one field is fused with the first word of
# the next. If a separator is added here, the test-set concatenation must
# be changed in the same way.
new_data['full'] = new_data['title'] + new_data['Body'] + new_data['b_url']

In [None]:
pip install contractions

In [None]:
import re
import contractions
import unicodedata
from bs4 import BeautifulSoup
import string
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
 
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
# English stop words as a set: text_preprocess tests every token with
# `not in stopword`, which is O(1) on a set instead of a linear list scan.
stopword = set(stopwords.words('english'))

ps = PorterStemmer()
lemmatizer = WordNetLemmatizer()


def text_preprocess(text):
    """Clean one raw document into a space-joined string of lemmatised tokens.

    Steps, in order: expand contractions (best effort), strip accents and
    other non-ASCII characters, drop ``@`` elements and e-mail addresses,
    replace starred words with ``starword``, blank out URLs, strip HTML tags,
    lowercase, remove punctuation and digits, tokenise, drop English stop
    words and lemmatise.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces.
    """
    # Best effort: if contraction expansion fails, continue with the
    # unexpanded text. Only Exception is caught, so KeyboardInterrupt and
    # SystemExit still propagate, and fix() is called once rather than twice.
    try:
        text = contractions.fix(text)
    except Exception:
        pass

    # Decompose accented characters, then drop whatever is not ASCII.
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    # Remove @elements (a space followed by '@' and the non-space run after it).
    text = re.sub(r' @[^\s]*',"",text)
    # Remove e-mail addresses.
    text = re.sub(r'(([A-Za-z0-9._-]+)@([A-Za-z0-9._-]+)(\.)([A-Za-z]{2,8}))',"",text)
    # Replace words containing '*' between alphanumerics with "starword".
    text = re.sub(r'([A-Za-z0-9]+)(\*)+([A-Za-z0-9]+)','starword',text)
    # Replace URLs with a single space.
    text = re.sub(r'((https|http|ftp)?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s]{2,}|www\.[a-zA-Z0-9]+\.[^\s]{2,})'," ",text)
    # Strip HTML tags, separating text nodes with a space.
    text = BeautifulSoup(text, 'lxml').get_text(" ")
    text = text.lower()
    # Drop punctuation, then digits.
    text =  re.sub(r'[^\w\s]', '', text)
    text =  re.sub(r'[0-9]', '', text)

    tokens = word_tokenize(text)
    # Stop words are filtered before lemmatisation.
    lemmas = [lemmatizer.lemmatize(token) for token in tokens if token not in stopword]
    return " ".join(lemmas)

In [None]:
# Clean every training document; the function is passed directly, no lambda wrapper needed.
new_data['full'] = new_data['full'].apply(text_preprocess)

## Implementing Count Vectorizer / TFidf vectorizer

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Count-based vectorizer kept for comparison; it is not fitted in this cell.
cv2 = CountVectorizer(min_df=2)

# TF-IDF features are the ones actually used for modelling.
cv_tf = TfidfVectorizer()
transformed_data = cv_tf.fit_transform(new_data['full'])

# Feature matrix and target vector for the models below.
X = transformed_data
y = new_data['label'].values

In [None]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn import metrics
from sklearn.base import clone

## Implementing StratifiedKfold validation and training logistic regression

## Simple Logistic
# Seed the shuffled splitter so fold membership (and the printed scores) are
# reproducible across runs.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
log = LogisticRegression(random_state=0, solver = 'liblinear')

for fold_no, (train_index, val_index) in enumerate(skf.split(X, y)):
    X_train, X_val = X[train_index], X[val_index]
    y_train, y_val = y[train_index], y[val_index]

    # Fresh, unfitted copy of the base model for every fold.
    clf = clone(log)
    clf.fit(X_train, y_train)
    # ROC AUC must be computed from continuous scores; hard 0/1 predictions
    # collapse the curve to a single operating point. Column 1 is the
    # probability of clf.classes_[1], which matches pos_label=1 below
    # assuming the labels are {0, 1}.
    pred = clf.predict_proba(X_val)[:, 1]
    fpr, tpr, thresholds = metrics.roc_curve(y_val, pred, pos_label=1)
    auc = metrics.auc(fpr,tpr)
    print("this is our fold no - {} and roc_auc_score is{} and auc is {}".format(fold_no,roc_auc_score(y_val, pred), auc))

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer, roc_auc_score
# Use the built-in 'roc_auc' scorer: it scores the estimator's continuous
# outputs, whereas make_scorer(roc_auc_score) with default settings feeds it
# hard predict() labels.
score = 'roc_auc'
parameter_grid = {
    'solver': ['saga'],
    # NOTE(review): newer scikit-learn releases expect penalty=None rather
    # than the string 'none' - confirm against the installed version.
    'penalty' : ['l1', 'l2', 'elasticnet', 'none'],
    # Required when penalty='elasticnet' (fit fails without it); it is not
    # used by the other penalties.
    'l1_ratio': [0.5],
    'C' : np.logspace(-4, 4, 10),
    'max_iter': [100,500,1000,2000]
}

log_clf = LogisticRegression(random_state=0)
# random_state makes the sampled candidates reproducible between runs.
clf = RandomizedSearchCV(log_clf, parameter_grid,cv = 5,n_iter = 30,verbose = 2, n_jobs = -1, scoring = score, random_state = 0)
history = clf.fit(X,y)

In [None]:
# The search above is created without `refit`, so the default (refit=True)
# applies and best_estimator_ has already been refitted on all of X, y.
# Fitting it again only repeated that work.
optimised_logistic = clf.best_estimator_

## Conclusion
1. Considering this is our initial approach without any hyperparameter optimisation, we achieved a good ROC AUC using only simple logistic regression and a count vectorizer.

2. Basically, cleaning the text did not work out.

3. Next, the vectorizer was changed to the TF-IDF vectorizer; after applying it, the score improved from 75 to 81, and it was fast too.

## Creating submission file

In [None]:
# Read the tab-separated test file into a DataFrame.
test_data = pd.read_csv('../input/stumbleupon/test.tsv', sep ='\t')

In [None]:
# Parse the boilerplate JSON and build the same concatenated text field that
# was built for the training frame.
test_data = transform_data(test_data)
test_data['full'] = test_data['title'] + test_data['Body'] + test_data['b_url']
# Take an explicit copy of the column subset: a later cell assigns into
# test_data['full'], which on a plain slice is a chained assignment
# (SettingWithCopyWarning).
test_data = test_data[['urlid','full']].copy()

In [None]:
# Clean the test documents with the same preprocessing as the training set.
test_data['full'] = test_data['full'].apply(text_preprocess)
# Project onto the vocabulary learned by the already-fitted TF-IDF vectorizer.
test_transformed_data = cv_tf.transform(test_data['full'])

In [None]:
# The notebook evaluates the model with ROC AUC, which ranks rows by score,
# so emit the positive-class probability (column 1 = classes_[1]) rather than
# hard 0/1 labels.
# NOTE(review): confirm the submission format accepts probabilities in `label`.
pred = pd.DataFrame(optimised_logistic.predict_proba(test_transformed_data)[:, 1], columns=['label'])
# concat(axis=1) aligns on index; resetting test_data's index guarantees a
# row-by-row match with the freshly created `pred` frame.
submission_dataframe = pd.concat([test_data.reset_index(drop=True), pred], axis=1).drop(['full'], axis=1)

In [None]:
# Write the submission frame to CSV without the DataFrame index column.
submission_dataframe.to_csv('submission1_file.csv',index = False)