In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
%matplotlib notebook

# Seed NumPy's global random number generator with a fixed value so that code
# drawing from it is repeatable between runs.
np.random.seed(19)

In [None]:
# NOTE(review): `p_list` is not defined in any cell visible above this one, so on a
# fresh kernel this cell presumably raises NameError — confirm whether it is
# leftover scratch work that should be removed.
# Converts every element of p_list to str and displays them joined by single spaces.
p_list=[str(i) for i in p_list]
' '.join(p_list)


In [None]:
# Read the training and test tables, then index each one by its Webpage_id
# column (the column itself stays in the frame).
train = pd.read_csv('./Data/train.csv')
test = pd.read_csv('./Data/test.csv')
train.index = train['Webpage_id']
test.index = test['Webpage_id']

# Report how many rows each table contains.
print("Training samples = ", len(train))
print("Test samples = ", len(test))

In [None]:
# One-hot encode Tag into Tag_<value> indicator columns, append them to the
# training frame, and discard the original Tag column.
train = pd.concat(
    [train, pd.get_dummies(train[['Tag']], prefix=['Tag'])],
    axis=1,
).drop(columns=['Tag'])
# Domain is removed from the test frame here.
test = test.drop(columns=['Domain'])

In [None]:
# Display the column labels currently present in the test frame.
test.columns

In [None]:
# Indicator columns produced by one-hot encoding Tag.
tag_columns = [
    'Tag_clinicalTrials', 'Tag_conferences', 'Tag_forum',
    'Tag_guidelines', 'Tag_news', 'Tag_others',
    'Tag_profile', 'Tag_publication', 'Tag_thesis',
]

# Number of training rows, recorded before the frames are modified further.
train_rows = len(train)
# Keep the indicator columns as the targets, then remove them (and Domain)
# from the training features.
labels = train[tag_columns]
train = train.drop(columns=tag_columns + ['Domain'])
# pop() removes Webpage_id from the test frame and returns it.
test_id = test.pop('Webpage_id')
train.head()

In [None]:
# Print the per-column count of missing values for each frame.
for caption, frame in (
    ("Null values in training data", train),
    ("Null values in testing data", test),
):
    print(caption, frame.isnull().sum(), sep="\n")

In [None]:
# Stack the training rows on top of the test rows (sort=True is passed to
# concat) and discard the Webpage_id column from the combined frame.
data = pd.concat([train, test], sort=True).drop(columns=['Webpage_id'])
# The separate frames are not needed past this point; release the names.
del train, test
data.shape

In [None]:
def _paragraph_text(markup):
    """Return the text of every <p> element in an HTML string, joined by spaces.

    Args:
        markup: Raw HTML as a str. Any non-string value (for example a missing
            value read from the CSV) is treated as an empty document.

    Returns:
        str: The paragraph texts separated by single spaces; '' when there are
        no <p> elements or the input is not a string.
    """
    if not isinstance(markup, str):
        return ''
    soup = BeautifulSoup(markup, 'html.parser')
    # get_text() also collects text nested inside child tags. Tag.string is None
    # for paragraphs with nested markup, and str(None) would put the literal
    # word "None" into the text.
    return ' '.join(p.get_text(' ', strip=True) for p in soup.find_all('p'))


chunksize = 10000

# Parse the HTML file chunk by chunk, keeping only the extracted paragraph text
# (indexed by Webpage_id). The pieces are combined once after the loop instead
# of concatenating frames on every iteration.
# infer_datetime_format is omitted: False was its default, so behaviour is unchanged.
text_parts = []
for html in pd.read_csv('./Data/html_data.csv', index_col=False, chunksize=chunksize):
    html.index = html.Webpage_id
    text_parts.append(html['Html'].apply(_paragraph_text))

page_text = pd.concat(text_parts) if text_parts else pd.Series(dtype=object)
# reindex() needs unique labels on the source; keep the first text per page id.
page_text = page_text[~page_text.index.duplicated(keep='first')]

# Align the text to data's existing rows WITHOUT adding or reordering rows, so
# the later positional split on train_rows stays valid. Pages that have no HTML
# record contribute an empty string rather than turning Url into NaN.
aligned_text = page_text.reindex(data.index).fillna('')
# A space separates the URL from the page text so the last URL token and the
# first text token are not fused together.
data['Url'] = data['Url'] + ' ' + aligned_text.to_numpy()

# Kept for compatibility with the original cell: the rows whose Url is present.
modified_data = data[data['Url'].notna()]


In [None]:
import re
import nltk

# English stop-word set built from NLTK's stopwords corpus.
# NOTE(review): the only use of stop_words in preprocess_input below is
# commented out, so in the visible code this set is never read.
# Presumably the corpus must already be downloaded (nltk.download('stopwords')) — confirm.
stop_words = set(nltk.corpus.stopwords.words('english'))

def preprocess_input(comment):
    """Normalise a piece of text before it is vectorised.

    The text is stripped, lowercased, has every whitespace character and digit
    replaced by a space, is tokenised with nltk.word_tokenize, and tokens of
    two characters or fewer are dropped.

    Args:
        comment: The text to clean. Non-string values (None, NaN, ...) are
            treated as empty text instead of raising AttributeError.

    Returns:
        str: The surviving tokens joined by single spaces ('' for empty or
        non-string input).
    """
    if not isinstance(comment, str):
        return ''
    # Trim the ends and lowercase so that 'hate' and 'HaTe' compare equal.
    comment = comment.strip().lower()
    # Replace whitespace characters (tabs, newlines, ...) and digits with a space.
    # A raw string is required: '\s' in a plain literal is an invalid escape sequence.
    comment = re.sub(r'[\s0-9]', ' ', comment)
    # nltk's tokenizer handles punctuation better than str.split.
    words = nltk.word_tokenize(comment)
    # Stop-word filtering is currently disabled; only very short tokens are dropped.
    words = [word for word in words if len(word) > 2]
    return ' '.join(words)

In [None]:
# Show the first Url value next to its preprocessed form as a quick sanity check.
sample_text = data.Url.iloc[0]
print("SAMPLE PREPROCESSING")
print("\nOriginal comment: ", sample_text, sep='\n')
print("\nProcessed comment: ", preprocess_input(sample_text), sep='\n')

In [None]:
# Run every Url value through preprocess_input and store the result back in the column.
data.Url = data.Url.apply(preprocess_input)

In [None]:
# Display the first rows of the combined frame after preprocessing.
data.head()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over character n-grams of length 1 to 3, with sublinear term-frequency
# scaling and unicode accent stripping. Document-frequency limits are given as
# fractions (0.1 to 0.7), and the vocabulary is capped at 5000 features.
tfidf_options = dict(
    analyzer='char',
    ngram_range=(1, 3),
    min_df=0.1,
    max_df=0.7,
    max_features=5000,
    strip_accents='unicode',
    sublinear_tf=True,
)
vect = TfidfVectorizer(**tfidf_options)

In [None]:
# Split the combined frame back by position: the first train_rows rows are the
# training rows and the remainder are the test rows (this relies on the row
# order being unchanged since the frames were concatenated).
train = data.iloc[:train_rows]
test = data.iloc[train_rows:]
del data

In [None]:
# Fit the vectorizer on the training text only, then encode both splits with it.
# Both transforms are evaluated before train/test are rebound to the matrices.
vect = vect.fit(train['Url'])
train, test = vect.transform(train['Url']), vect.transform(test['Url'])

In [None]:
# Report the (rows, features) shape of each encoded matrix.
for caption, matrix in (
    ('Training feature set = ', train),
    ('Testing feature set = ', test),
):
    print(caption, matrix.shape)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
# One binary indicator column per tag; each gets its own classifier.
cols = ['Tag_clinicalTrials', 'Tag_conferences','Tag_forum', 'Tag_guidelines','Tag_news', 'Tag_others', 'Tag_profile','Tag_publication', 'Tag_thesis']
# The sample submission is loaded as the frame that receives the predictions.
# NOTE(review): assumes its rows are in the same order and number as the test
# matrix — confirm against the data files.
y_pred = pd.read_csv('./Data/sample_submission.csv')

for c in cols:
    # Only keyword arguments are passed: a bare positional name after them is a
    # SyntaxError that prevents the whole cell from running.
    clf = LogisticRegression(C=4, solver='sag')
    clf.fit(train, labels[c])
    # Column 1 of predict_proba is the probability of the second class in clf.classes_.
    y_pred[c] = clf.predict_proba(test)[:,1]
    # Mean ROC AUC over 5 cross-validation folds on the training matrix.
    score = np.mean(cross_val_score(clf, train, labels[c], scoring='roc_auc', cv=5))
    print("ROC_AUC score for", c, "=",  score)

In [None]:
# For each row, pick the tag column with the highest predicted probability,
# then strip the 'Tag_' text from the column name to recover the tag value.
tag_columns = [
    'Tag_clinicalTrials', 'Tag_conferences', 'Tag_forum',
    'Tag_guidelines', 'Tag_news', 'Tag_others',
    'Tag_profile', 'Tag_publication', 'Tag_thesis',
]
y_pred['Tag'] = y_pred[tag_columns].idxmax(axis=1)
y_pred['Tag'] = y_pred['Tag'].apply(lambda name: re.sub('Tag_', '', name))

In [None]:
# Write the Webpage_id and predicted Tag columns to a CSV file without the index.
# NOTE(review): the file name says "Linear_Regression" and "Feature8000", but the
# model fitted above is LogisticRegression and the vectorizer is configured with
# max_features=5000 — confirm whether the name should be updated.
y_pred[['Webpage_id','Tag']].to_csv('Linear_Regression_C4_Feature8000.csv',index=False)

In [2]:
import pandas as pd
# Walk the HTML file in chunks and print each chunk's ordinal and shape.
chunksize = 10000
i = 0  # ends up holding the number of chunks read
html_chunks = pd.read_csv('./Data/html_data.csv',index_col=False,infer_datetime_format=False,chunksize=chunksize)
for i, html in enumerate(html_chunks, start=1):
    html.index = html.Webpage_id
    print('THis is iteration no:', i)
    print("shape of df:", html.shape)

THis is iteration no: 1
shape of df: (10000, 2)
THis is iteration no: 2
shape of df: (10000, 2)
THis is iteration no: 3
shape of df: (10000, 2)
THis is iteration no: 4
shape of df: (10000, 2)
THis is iteration no: 5
shape of df: (10000, 2)
THis is iteration no: 6
shape of df: (10000, 2)
THis is iteration no: 7
shape of df: (10000, 2)
THis is iteration no: 8
shape of df: (9345, 2)
