## Importing Required Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score,roc_curve,f1_score
from sklearn.model_selection import StratifiedKFold,KFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB, GaussianNB

In [2]:
main_train_df = pd.read_csv("../input/ml-project-dataset/train_data.csv")
main_test_df = pd.read_csv("../input/ml-project-dataset/test_data.csv")

In [3]:
working_df = main_train_df.copy()

In [4]:
working_df.head(10)

In [5]:
working_df.describe()

##Preprocessing

In [6]:
working_df['id'].nunique()

In [7]:
working_df['framebased'].value_counts()

In [8]:
# Drop 'framebased'. The original note said it is "unique for all rows"; going by the
# value_counts() check in the previous cell this presumably means the column holds the
# same single value in every row (TODO confirm against that output), i.e. it carries no signal.
working_df.drop(axis = "columns",labels= ['framebased'], inplace = True)

In [9]:
working_df.isna().sum()

In [10]:
(working_df == "?").sum()

In [11]:
working_df['alchemy_category'].value_counts()

Replacing all Missing Values in Alchemy Category with 'unknown'

In [12]:
# Assign the result back instead of calling replace(..., inplace=True) on the column
# selection: under pandas Copy-on-Write that selection is a temporary object, so the
# in-place form can leave working_df itself unchanged.
working_df['alchemy_category'] = working_df['alchemy_category'].replace("?", "unknown")

Checking Alchemy Category Score Corresponding to 'unknown' alchemy category

In [13]:
vals = []

In [14]:
# Scores of the rows whose category is "unknown" although a score is present.
# A boolean mask replaces the per-row chained-indexing loop, and the list is rebuilt
# from scratch so re-running the cell no longer appends duplicates.
unknown_with_score = (
    (working_df['alchemy_category'] == "unknown")
    & (working_df['alchemy_category_score'] != "?")
)
vals = working_df.loc[unknown_with_score, 'alchemy_category_score'].astype(float).tolist()

In [15]:
vals

Assigning score of 0.400001 to all unknown categories (Later, we can consider something else)

In [16]:
# Fill the "?" placeholder with the chosen default score. The value stays a string here;
# the column is converted to float in the next cell. Assigning back (rather than
# inplace=True on the column selection) guarantees working_df is actually updated
# under pandas Copy-on-Write.
working_df['alchemy_category_score'] = working_df['alchemy_category_score'].replace("?", "0.400001")

In [17]:
working_df['alchemy_category_score'] = working_df['alchemy_category_score'].astype(np.float64)

In [18]:
working_df['isNews'].value_counts()

In [19]:
working_df['isFrontPageNews'].value_counts()

In [20]:
#Removing 'isNews' column as it has many missing values
working_df.drop(axis = "columns",labels= ['isNews'], inplace = True)

In [21]:
# Most frequent *observed* value. "?" marks a missing entry and is excluded from the vote:
# if it were the most common value, the imputation in the next cell would replace "?"
# with "?" and do nothing.
observed_front_page = working_df.loc[working_df['isFrontPageNews'] != "?", 'isFrontPageNews']
isFrontPageNews_mode = observed_front_page.mode()[0]

In [22]:
# Impute missing entries ("?") with the mode computed above. Assigning back instead of
# using inplace=True on the column selection ensures working_df is really modified
# under pandas Copy-on-Write.
working_df['isFrontPageNews'] = working_df['isFrontPageNews'].replace("?", isFrontPageNews_mode)

##Exploratory Data Analysis

Correlation Matrix

In [176]:
fig, ax = plt.subplots(figsize=(10, 10))
# Correlate numeric columns only: the frame still contains text columns, and on
# pandas >= 2.0 DataFrame.corr() raises on non-numeric data instead of dropping it.
numeric_corr = working_df.select_dtypes(include="number").corr()
dataplot = sns.heatmap(numeric_corr, cmap="YlGnBu", annot=True, ax=ax)
ax.set_title("Correlation matrix of numeric features")
plt.show()

###Continuous Columns

Skewness Removal

In [23]:
sns.histplot(working_df['alchemy_category_score'], kde=True, line_kws={"color":"red"})
plt.show()

In [24]:
sns.histplot(working_df['avgLinkWordLength'], kde=True, line_kws={"color":"red"})
plt.show()

In [25]:
sns.histplot(working_df['AvglinkWithOneCommonWord'], kde=True, line_kws={"color":"red"})
plt.show()

In [26]:
sns.histplot(working_df['AvglinkWithTwoCommonWord'], kde=True, line_kws={"color":"red"})
plt.show()

In [27]:
working_df['AvglinkWithTwoCommonWord'] = np.log(0.1+working_df['AvglinkWithTwoCommonWord'])

In [28]:
sns.histplot(working_df['AvglinkWithTwoCommonWord'], kde=True, line_kws={"color":"red"})
plt.show()

In [29]:
sns.histplot(working_df['AvglinkWithThreeCommonWord'], kde=True, line_kws={"color":"red"})
plt.show()

In [30]:
working_df['AvglinkWithThreeCommonWord'] = np.log(0.01+working_df['AvglinkWithThreeCommonWord'])

In [31]:
sns.histplot(working_df['AvglinkWithThreeCommonWord'], kde=True, line_kws={"color":"red"})
plt.show()

In [32]:
sns.histplot(working_df['AvglinkWithFourCommonWord'], kde=True, line_kws={"color":"red"})
plt.show()

In [33]:
working_df['AvglinkWithFourCommonWord'] = np.log(0.001+working_df['AvglinkWithFourCommonWord'])

In [34]:
sns.histplot(working_df['AvglinkWithFourCommonWord'], kde=True, line_kws={"color":"red"})
plt.show()

In [35]:
sns.histplot(working_df['redundancyMeasure'], kde=True, line_kws={"color":"red"})
plt.show()

In [36]:
sns.histplot(working_df['embedRatio'], kde=True, line_kws={"color":"red"})
plt.show()

In [37]:
sns.histplot(working_df['frameTagRatio'], kde=True, line_kws={"color":"red"})
plt.show()

In [38]:
working_df['frameTagRatio'] = np.log(0.1+working_df['frameTagRatio'])

In [39]:
sns.histplot(working_df['frameTagRatio'], kde=True, line_kws={"color":"red"})
plt.show()

In [40]:
sns.histplot(working_df['tagRatio'], kde=True, line_kws={"color":"red"})
plt.show()

In [41]:
sns.histplot(working_df['imageTagRatio'], kde=True, line_kws={"color":"red"})
plt.show()

In [42]:
sns.histplot(working_df['hyperlinkToAllWordsRatio'], kde=True, line_kws={"color":"red"})
plt.show()

In [43]:
working_df['hyperlinkToAllWordsRatio'] = np.log(10+working_df['hyperlinkToAllWordsRatio'])

In [44]:
sns.histplot(working_df['hyperlinkToAllWordsRatio'], kde=True, line_kws={"color":"red"})
plt.show()

In [45]:
sns.histplot(working_df['alphanumCharCount'], kde=True, line_kws={"color":"red"})
plt.show()

In [46]:
sns.histplot(working_df['linksCount'], kde=True, line_kws={"color":"red"})
plt.show()

In [47]:
sns.histplot(working_df['wordCount'], kde=True, line_kws={"color":"red"})
plt.show()

In [48]:
working_df['wordCount'] = np.log(2+working_df['wordCount'])

In [49]:
sns.histplot(working_df['wordCount'], kde=True, line_kws={"color":"red"})
plt.show()

In [50]:
sns.histplot(working_df['parametrizedLinkRatio'], kde=True, line_kws={"color":"red"})
plt.show()

In [51]:
working_df['parametrizedLinkRatio'] = np.log(0.07+working_df['parametrizedLinkRatio'])

In [52]:
sns.histplot(working_df['parametrizedLinkRatio'], kde=True, line_kws={"color":"red"})
plt.show()

In [53]:
sns.histplot(working_df['spellingErrorsRatio'], kde=True, line_kws={"color":"red"})
plt.show()

Outliers Removal

In [54]:
class OutlierRemoval:
    """Clamp values to the box-plot whiskers derived from two quartiles.

    Despite the name nothing is removed: values outside
    ``[Q1 - 1.5*IQR, Q3 + 1.5*IQR]`` are replaced by the nearest whisker.
    """

    def __init__(self, lower_quartile, upper_quartile):
        """Compute both whiskers from the lower (Q1) and upper (Q3) quartile."""
        iqr = upper_quartile - lower_quartile
        self.lower_whisker = lower_quartile - 1.5 * iqr
        self.upper_whisker = upper_quartile + 1.5 * iqr

    def removeOutlier(self, x):
        """Return ``x`` clamped to ``[lower_whisker, upper_whisker]``.

        NaN is returned unchanged. Every comparison with NaN is False, so the
        previous single-expression form silently mapped NaN to the upper whisker.
        """
        if x != x:  # only NaN is unequal to itself
            return x
        if x < self.lower_whisker:
            return self.lower_whisker
        if x > self.upper_whisker:
            return self.upper_whisker
        return x

In [55]:
sns.boxplot(x=working_df['alchemy_category_score'])
plt.show()

In [56]:
sns.boxplot(x=working_df['avgLinkWordLength'])
plt.show()

In [57]:
# Pull extreme 'avgLinkWordLength' values back to the box-plot whiskers
# (Q1 - 1.5*IQR .. Q3 + 1.5*IQR) computed from the column's own quartiles.
feature = working_df['avgLinkWordLength']
first_quartile, third_quartile = feature.quantile(0.25), feature.quantile(0.75)
feature_outlier_remover = OutlierRemoval(first_quartile, third_quartile)
working_df['avgLinkWordLength'] = feature.apply(feature_outlier_remover.removeOutlier)

In [58]:
sns.boxplot(x=working_df['AvglinkWithOneCommonWord'])
plt.show()

In [59]:
sns.boxplot(x=working_df['AvglinkWithTwoCommonWord'])
plt.show()

In [60]:
sns.boxplot(x=working_df['AvglinkWithThreeCommonWord'])
plt.show()

In [61]:
sns.boxplot(x=working_df['AvglinkWithFourCommonWord'])
plt.show()

In [62]:
sns.boxplot(x=working_df['redundancyMeasure'])
plt.show()

In [63]:
# Pull extreme 'redundancyMeasure' values back to the box-plot whiskers
# (Q1 - 1.5*IQR .. Q3 + 1.5*IQR) computed from the column's own quartiles.
feature = working_df['redundancyMeasure']
first_quartile, third_quartile = feature.quantile(0.25), feature.quantile(0.75)
feature_outlier_remover = OutlierRemoval(first_quartile, third_quartile)
working_df['redundancyMeasure'] = feature.apply(feature_outlier_remover.removeOutlier)

In [64]:
sns.boxplot(x=working_df['frameTagRatio'])
plt.show()

In [65]:
# Pull extreme 'frameTagRatio' values back to the box-plot whiskers
# (Q1 - 1.5*IQR .. Q3 + 1.5*IQR) computed from the column's own quartiles.
feature = working_df['frameTagRatio']
first_quartile, third_quartile = feature.quantile(0.25), feature.quantile(0.75)
feature_outlier_remover = OutlierRemoval(first_quartile, third_quartile)
working_df['frameTagRatio'] = feature.apply(feature_outlier_remover.removeOutlier)

In [66]:
sns.boxplot(x=working_df['tagRatio'])
plt.show()

In [67]:
# Pull extreme 'tagRatio' values back to the box-plot whiskers
# (Q1 - 1.5*IQR .. Q3 + 1.5*IQR) computed from the column's own quartiles.
feature = working_df['tagRatio']
first_quartile, third_quartile = feature.quantile(0.25), feature.quantile(0.75)
feature_outlier_remover = OutlierRemoval(first_quartile, third_quartile)
working_df['tagRatio'] = feature.apply(feature_outlier_remover.removeOutlier)

In [68]:
sns.boxplot(x=working_df['imageTagRatio'])
plt.show()

In [69]:
# Pull extreme 'imageTagRatio' values back to the box-plot whiskers
# (Q1 - 1.5*IQR .. Q3 + 1.5*IQR) computed from the column's own quartiles.
feature = working_df['imageTagRatio']
first_quartile, third_quartile = feature.quantile(0.25), feature.quantile(0.75)
feature_outlier_remover = OutlierRemoval(first_quartile, third_quartile)
working_df['imageTagRatio'] = feature.apply(feature_outlier_remover.removeOutlier)

In [70]:
sns.boxplot(x=working_df['hyperlinkToAllWordsRatio'])
plt.show()

In [71]:
sns.boxplot(x=working_df['alphanumCharCount'])
plt.show()

In [72]:
# Pull extreme 'alphanumCharCount' values back to the box-plot whiskers
# (Q1 - 1.5*IQR .. Q3 + 1.5*IQR) computed from the column's own quartiles.
feature = working_df['alphanumCharCount']
first_quartile, third_quartile = feature.quantile(0.25), feature.quantile(0.75)
feature_outlier_remover = OutlierRemoval(first_quartile, third_quartile)
working_df['alphanumCharCount'] = feature.apply(feature_outlier_remover.removeOutlier)

In [73]:
sns.boxplot(x=working_df['linksCount'])
plt.show()

In [74]:
# Pull extreme 'linksCount' values back to the box-plot whiskers
# (Q1 - 1.5*IQR .. Q3 + 1.5*IQR) computed from the column's own quartiles.
feature = working_df['linksCount']
first_quartile, third_quartile = feature.quantile(0.25), feature.quantile(0.75)
feature_outlier_remover = OutlierRemoval(first_quartile, third_quartile)
working_df['linksCount'] = feature.apply(feature_outlier_remover.removeOutlier)

In [75]:
sns.boxplot(x=working_df['wordCount'])
plt.show()

In [76]:
# Pull extreme 'wordCount' values back to the box-plot whiskers
# (Q1 - 1.5*IQR .. Q3 + 1.5*IQR) computed from the column's own quartiles.
feature = working_df['wordCount']
first_quartile, third_quartile = feature.quantile(0.25), feature.quantile(0.75)
feature_outlier_remover = OutlierRemoval(first_quartile, third_quartile)
working_df['wordCount'] = feature.apply(feature_outlier_remover.removeOutlier)

In [77]:
sns.boxplot(x=working_df['parametrizedLinkRatio'])
plt.show()

In [78]:
sns.boxplot(x=working_df['spellingErrorsRatio'])
plt.show()

In [79]:
# Pull extreme 'spellingErrorsRatio' values back to the box-plot whiskers
# (Q1 - 1.5*IQR .. Q3 + 1.5*IQR) computed from the column's own quartiles.
feature = working_df['spellingErrorsRatio']
first_quartile, third_quartile = feature.quantile(0.25), feature.quantile(0.75)
feature_outlier_remover = OutlierRemoval(first_quartile, third_quartile)
working_df['spellingErrorsRatio'] = feature.apply(feature_outlier_remover.removeOutlier)

### Categorical Columns

In [80]:
working_df['domainLink'].value_counts()

In [81]:
working_df.drop(axis = "columns",labels= ['domainLink'], inplace = True)

In [82]:
working_df['lengthyDomain'].value_counts()

In [83]:
working_df.head()

### One-Hot Encoding

In [84]:
# Keep only the tabular predictors. Besides the raw text columns, 'id' is dropped here:
# it is still present in working_df at this point and, being a row identifier rather
# than a feature, would otherwise be scaled and fed to the model through other_X.
useful_working_df = working_df.drop(columns=['id', 'url', 'webpageDescription'])

In [85]:
useful_working_df = pd.get_dummies(useful_working_df, columns = ['alchemy_category'])

In [86]:
other_X = useful_working_df.drop(axis = "columns",labels="label").to_numpy().astype(np.float64)

In [87]:
scaler = StandardScaler()
other_X = scaler.fit_transform(other_X)

##Language Preprocessing

Parsing Given HTML Files 

In [88]:
# Read every raw HTML page into memory; html_content[k] holds the text of the file named str(k).
# Files are decoded as UTF-8 first. Only a decoding failure triggers the ISO-8859-1 retry,
# so other problems (e.g. a missing file) surface directly instead of being caught by a
# bare `except`.
HTML_DIR = '../input/html-files/html_content/'
count1 = 0  # never incremented; kept only because the original cell defined it
count2 = 0  # number of files that needed the ISO-8859-1 fallback
html_content = []
for i in range(7395):  # presumably the number of HTML files in the folder -- TODO confirm
    html_path = HTML_DIR + str(i)
    try:
        with open(html_path, 'r', encoding='utf-8') as file:
            data = file.read()
    except UnicodeDecodeError:
        with open(html_path, 'r', encoding='iso-8859-1') as file:
            data = file.read()
        count2 += 1
    html_content.append(data)

In [89]:
from bs4 import BeautifulSoup
from unidecode import unidecode
import json, re

In [90]:
def boil_soup(urlid, parser="lxml"):
    """Parse the cached HTML of page ``urlid`` into a BeautifulSoup tree.

    ``parser`` is tried first, followed by the remaining parsers of
    ("lxml", "xml", "html5lib"); the first tree that contains a ``<body>`` is
    returned. Previously the argument was shadowed by the loop variable and
    therefore ignored; with the default value the order is unchanged.

    If no parser produces a body, an lxml parse is returned as a last resort.
    """
    html = html_content[urlid]

    candidates = [parser] + [name for name in ("lxml", "xml", "html5lib") if name != parser]
    for candidate in candidates:
        soup = BeautifulSoup(html, candidate)
        if soup.body:
            return soup

    # Name the parser explicitly: BeautifulSoup(html) with no parser emits a
    # "guessed at parser" warning and picks lxml anyway when it is installed.
    return BeautifulSoup(html, "lxml")

In [91]:
def clean_string(s):
    """Normalise text to lower-case ASCII with single spaces between words.

    ``s`` may be a string or a list/tuple of text fragments (the extraction
    cells pass lists). Fragments are joined with spaces before cleaning;
    passing a list straight to ``str()`` would clean its ``repr`` instead and
    leak brackets, quotes and backslash escape sequences into the text.
    Any other object is converted with ``str()`` as before.
    """
    if isinstance(s, (list, tuple)):
        st = ' '.join(str(fragment) for fragment in s)
    else:
        st = str(s)
    st = unidecode(st).lower()
    st = re.sub(r"\s+", ' ', st)
    return st.strip()

In [92]:
# html tags of interest
TAGS = ['title', 'h1', 'h2', 'h3', 'meta-description', 'meta-keywords','img', 'a', 'other']

# One dict per training row: key -> cleaned text collected for that key.
html_text_list = []
for i in range(working_df.shape[0]):

    data = {}
    urlid = int(working_df['id'][i])

    # parse html
    soup = boil_soup(urlid)

    # remove non-text tags
    for tag in ['script', 'style']:
        for el in soup.find_all(tag):
            el.extract()

    # Extract text for each tag. Every key in TAGS is initialised here, so names that
    # match no real HTML element ('meta-description', 'meta-keywords', 'other') start
    # out as empty lists and are only filled by the meta loop below.
    for tag in TAGS:
        items = []
        for el in soup.find_all(tag):
            el.extract()  # detach so nested text is not collected twice by a later tag

            if tag == 'img':
                # images carry their text in attributes, not in the element body
                for attr in ('alt', 'title'):
                    if el.has_attr(attr):
                        items.append(el[attr])
            else:
                items.append(el.text)

        data[tag] = items

    # Extract meta tags. The original called the Python 2 builtin `unicode`; on Python 3
    # that NameError was swallowed by a bare `except`, so no meta content was ever stored.
    for el in soup.find_all('meta'):
        prop = el.get('property') or el.get('name')
        if not prop:
            continue
        prop = prop.lower()
        content = el.get('content')
        if content is None:  # <meta> without a content attribute
            continue
        s = str(content)

        data['meta-'+prop] = s.split(',') if prop == 'keywords' else [s]

    for item in data:
        data[item] = clean_string(data[item])

    html_text_list.append(data)

In [93]:
# Collapse each page's per-tag dict into one string: the text of every key in TAGS,
# each preceded by a single space, in TAGS order.
# Entries that are already strings are kept as they are, so re-running the cell no
# longer fails when it tries to index a string by tag name.
html_text_list = [
    page if isinstance(page, str) else "".join(" " + page[tag] for tag in TAGS)
    for page in html_text_list
]

Extracting body, title and URL from the JSON text


In [94]:
# Split the JSON in 'webpageDescription' into three parallel lists (body, title, url).
# A key that is absent and a key whose value is JSON null are handled alike: a single
# space is stored so later string concatenation always works.
web_body_list, web_title_list, web_url_list = [], [], []
field_targets = (
    ('body', web_body_list),
    ('title', web_title_list),
    ('url', web_url_list),
)
for i in range(working_df.shape[0]):
    dictionary_json = json.loads(working_df['webpageDescription'][i])
    for field_name, target_list in field_targets:
        field_value = dictionary_json.get(field_name)
        target_list.append(" " if field_value is None else field_value)
   

In [95]:
# One text per row: body, title and url joined by single spaces, in that order.
web_text_list = [
    body_text + " " + title_text + " " + url_text
    for body_text, title_text, url_text in zip(web_body_list, web_title_list, web_url_list)
]

##Natural Language Processing Techniques on Body Text

In [96]:
import re
import nltk
# nltk.download('stopwords')
# nltk.download('wordnet')

In [97]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

Lemmatization or Stemming

In [98]:
ps = PorterStemmer() #For Stemming
wordnets = WordNetLemmatizer() #For Lemmatization

In [99]:
# Load the stop-word list once. The original called stopwords.words('english') for every
# single token and searched the resulting list; a set built up front yields the same
# filtering with constant-time lookups.
english_stopwords = set(stopwords.words('english'))

corpus1 = []  # cleaned JSON text (body + title + url), one string per row
corpus2 = []  # cleaned text extracted from the raw HTML, one string per row
for i in range(working_df.shape[0]):

  #Removing Special Characters and Numbers, then lower-casing and tokenising on whitespace
  review1 = re.sub('[^a-zA-Z]', ' ', web_text_list[i]).lower().split()
  review2 = re.sub('[^a-zA-Z]', ' ', html_text_list[i]).lower().split()

  #Lemmatization (stop words are dropped before lemmatizing)
  review1 = [wordnets.lemmatize(word) for word in review1 if word not in english_stopwords]
  review2 = [wordnets.lemmatize(word) for word in review2 if word not in english_stopwords]

  #Stemming (alternative, currently unused): replace wordnets.lemmatize with ps.stem above

  corpus1.append(' '.join(review1))
  corpus2.append(' '.join(review2))

Bag of Words Approach

In [100]:
# from sklearn.feature_extraction.text import CountVectorizer

In [101]:
# cv = CountVectorizer()

TF-IDF Approach

for corpus 1

In [102]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [103]:
tv1 = TfidfVectorizer()

In [104]:
nlp_X_1 = tv1.fit_transform(corpus1).toarray()

Scaling

In [105]:
scaler1 = StandardScaler()

In [106]:
nlp_X_1 = scaler1.fit_transform(nlp_X_1)

for corpus 2

In [107]:
tv2 = TfidfVectorizer()

In [108]:
nlp_X_2 = tv2.fit_transform(corpus2).toarray()

Scaling

In [109]:
scaler2 = StandardScaler()

In [110]:
nlp_X_2 = scaler2.fit_transform(nlp_X_2)

WORD2VEC

In [111]:
# from gensim.models import Word2Vec

# #continuous bag-of-words
# model1 = Word2Vec(sentences = corpus, min_count = 1, size = 100, window = 5)

# #skip gram
# #model2 = Word2Vec(sentences = corpus, min_count = 1, size = 100, window = 5, sg = 1)

# #change to select type of Word2Vec
# #CBOW is computationally less expensive than skip gram and gives similar results
# Model = model1

# documents = []
# for document in corpus:
#     for word in : 
#         word_vectors.append(Model.wv[word])
#     documents.append(np.concatenate(word_vectors))

# document_matrix = np.concatenate(documents)

# nlp_X = document_matrix

##Dimensionality Reduction Using PCA

##Dimensionality Reduction Using Truncated SVD

In [112]:
from sklearn.decomposition import TruncatedSVD

In [113]:
# Fix the seed: TruncatedSVD's default solver is randomized, so without random_state the
# 120 components (and every score computed from them) change from run to run.
svd1 = TruncatedSVD(n_components=120, random_state=42)
svd2 = TruncatedSVD(n_components=120, random_state=42)

In [114]:
nlp_X_1 = svd1.fit_transform(nlp_X_1)
nlp_X_2 = svd2.fit_transform(nlp_X_2)

##Splitting the Data

In [115]:
working_df.drop(axis = "columns",labels= ['id'], inplace = True)

In [116]:
train_Y = working_df["label"].to_numpy()

##Modeling

In [117]:
# from sklearn.linear_model import SGDClassifier
# from sklearn.ensemble import RandomForestClassifier
# from xgboost import XGBClassifier
# from sklearn.calibration import CalibratedClassifierCV
# from sklearn.svm import SVC
# from sklearn.neighbors import NearestNeighbors

In [118]:
# scores = cross_val_score(LogisticRegression(),nlp_X_1,train_Y, scoring = 'roc_auc')
# print(scores)
# print(np.mean(scores))

Ensemble Technique

In [119]:
model1 = LogisticRegression(solver = 'liblinear')

In [120]:
model1.fit(nlp_X_1,train_Y)

In [121]:
model2 = LogisticRegression(solver = 'liblinear')

In [122]:
model2.fit(nlp_X_2,train_Y)

In [123]:
model3 = LogisticRegression(solver = 'liblinear')

In [124]:
model3.fit(other_X,train_Y)

LogisticRegression

In [125]:
# log_reg_model = LogisticRegression(solver='liblinear')

In [126]:
# log_reg_model.fit(nlp_X_2,train_Y)

In [127]:
# pred_log_reg =log_reg_model.predict_proba(nlp_X_2).T[1]

In [128]:
# roc_auc_score(train_Y,pred_log_reg)

Multinomial Naive Bayes

In [129]:
# multinomialNB_model = MultinomialNB(alpha = 0.45)

In [130]:
# multinomialNB_model.fit(nlp_X,train_Y)

In [131]:
# pred_multiNB = multinomialNB_model.predict_proba(nlp_X).T[1]

In [132]:
# roc_auc_score(train_Y,pred_multiNB)

SGD Classifier

In [133]:
# sgd_model = SGDClassifier(loss = 'log', penalty='l2',alpha=1e-3, random_state=42, max_iter=5, tol=None)

In [134]:
# sgd_model.fit(final_X,train_Y)

In [135]:
# pred_sgd = sgd_model.predict_proba(final_X).T[1]

In [136]:
# roc_auc_score(train_Y,pred_sgd)

Random Forest

In [137]:
# random_forest_model = RandomForestClassifier()

In [138]:
# random_forest_model.fit(nlp_X,train_Y)

In [139]:
# pred_rf = random_forest_model.predict_proba(nlp_X).T[1]

In [140]:
# roc_auc_score(train_Y,pred_rf)

Calibrated SGD Classifier

In [141]:
# base_clf = SGDClassifier(loss = 'log', penalty='l2',alpha=1e-3, random_state=42, max_iter=5, tol=None)

In [142]:
# calibrated_clf = CalibratedClassifierCV(base_estimator=base_clf, cv=3, method = 'isotonic')

In [143]:
# calibrated_clf.fit(nlp_X, train_Y)

In [144]:
# pred_calibrated_clf = calibrated_clf.predict_proba(nlp_X).T[1]

In [145]:
# roc_auc_score(train_Y,pred_calibrated_clf)

SVM (RBF kernel)

In [146]:
# svm_model = SVC(kernel = 'rbf',probability=True, C = 0.2)

In [147]:
# svm_model.fit(nlp_X,train_Y)

In [148]:
# pred_svm = svm_model.predict_proba(nlp_X).T[1]

In [149]:
# roc_auc_score(train_Y,pred_svm)

XG Boost Classifier

In [150]:
# xgb_model = XGBClassifier()

In [151]:
# xgb_model.fit(nlp_X,train_Y)

In [152]:
# pred_xgb = xgb_model.predict_proba(final_X).T[1]

In [153]:
# roc_auc_score(train_Y,pred_xgb)

##Testing

In [154]:
main_test_df.isna().sum()

In [155]:
(main_test_df=="?").sum()

###NLP on Test Set

In [156]:
# html tags of interest
TAGS = ['title', 'h1', 'h2', 'h3', 'meta-description', 'meta-keywords','img', 'a', 'other']

# One dict per test row: key -> cleaned text collected for that key.
test_html_text_list = []
for i in range(main_test_df.shape[0]):

    data = {}
    urlid = int(main_test_df['id'][i])

    # parse html
    soup = boil_soup(urlid)

    # remove non-text tags
    for tag in ['script', 'style']:
        for el in soup.find_all(tag):
            el.extract()

    # Extract text for each tag. Every key in TAGS is initialised here, so names that
    # match no real HTML element ('meta-description', 'meta-keywords', 'other') start
    # out as empty lists and are only filled by the meta loop below.
    for tag in TAGS:
        items = []
        for el in soup.find_all(tag):
            el.extract()  # detach so nested text is not collected twice by a later tag

            if tag == 'img':
                # images carry their text in attributes, not in the element body
                for attr in ('alt', 'title'):
                    if el.has_attr(attr):
                        items.append(el[attr])
            else:
                items.append(el.text)

        data[tag] = items

    # Extract meta tags. The original called the Python 2 builtin `unicode`; on Python 3
    # that NameError was swallowed by a bare `except`, so no meta content was ever stored.
    for el in soup.find_all('meta'):
        prop = el.get('property') or el.get('name')
        if not prop:
            continue
        prop = prop.lower()
        content = el.get('content')
        if content is None:  # <meta> without a content attribute
            continue
        s = str(content)

        data['meta-'+prop] = s.split(',') if prop == 'keywords' else [s]

    for item in data:
        data[item] = clean_string(data[item])

    test_html_text_list.append(data)

In [157]:
# Collapse each test page's per-tag dict into one string: the text of every key in TAGS,
# each preceded by a single space, in TAGS order.
# Entries that are already strings are kept as they are, so re-running the cell no
# longer fails when it tries to index a string by tag name.
test_html_text_list = [
    page if isinstance(page, str) else "".join(" " + page[tag] for tag in TAGS)
    for page in test_html_text_list
]

In [158]:
# Split the test set's 'webpageDescription' JSON into three parallel lists (body, title, url).
# A key that is absent and a key whose value is JSON null are handled alike: a single
# space is stored so later string concatenation always works.
test_web_body_list, test_web_title_list, test_web_url_list = [], [], []
test_field_targets = (
    ('body', test_web_body_list),
    ('title', test_web_title_list),
    ('url', test_web_url_list),
)
for i in range(main_test_df.shape[0]):
    dictionary_json = json.loads(main_test_df['webpageDescription'][i])
    for field_name, target_list in test_field_targets:
        field_value = dictionary_json.get(field_name)
        target_list.append(" " if field_value is None else field_value)

In [159]:
# One text per test row: body, title and url joined by single spaces, in that order.
test_web_text_list = [
    body_text + " " + title_text + " " + url_text
    for body_text, title_text, url_text in zip(
        test_web_body_list, test_web_title_list, test_web_url_list
    )
]

In [160]:
# Load the stop-word list once. The original called stopwords.words('english') for every
# single token and searched the resulting list; a set built up front yields the same
# filtering with constant-time lookups.
english_stopwords = set(stopwords.words('english'))

corpus_test1 = []  # cleaned JSON text (body + title + url), one string per test row
corpus_test2 = []  # cleaned text extracted from the raw HTML, one string per test row
for i in range(main_test_df.shape[0]):
  # keep letters only, lower-case, tokenise on whitespace
  review1 = re.sub('[^a-zA-Z]', ' ', test_web_text_list[i]).lower().split()
  review2 = re.sub('[^a-zA-Z]', ' ', test_html_text_list[i]).lower().split()

  # drop stop words, then lemmatize the remaining tokens
  review1 = [wordnets.lemmatize(word) for word in review1 if word not in english_stopwords]
  review2 = [wordnets.lemmatize(word) for word in review2 if word not in english_stopwords]

  corpus_test1.append(' '.join(review1))
  corpus_test2.append(' '.join(review2))

In [161]:
nlp_test_X_1 = tv1.transform(corpus_test1).toarray()
nlp_test_X_2 = tv2.transform(corpus_test2).toarray()

In [162]:
nlp_test_X_1 = scaler1.transform(nlp_test_X_1)
nlp_test_X_2 = scaler2.transform(nlp_test_X_2)

In [163]:
nlp_test_X_1 = svd1.transform(nlp_test_X_1)
nlp_test_X_2 = svd2.transform(nlp_test_X_2)

In [164]:
final_Ytest_1 = model1.predict_proba(nlp_test_X_1).T[1]
final_Ytest_2 = model2.predict_proba(nlp_test_X_2).T[1]
# final_Ytest_3 = model3.predict_proba(nlp_test_X_2)

In [165]:
final_Ytest = (2*final_Ytest_1 + final_Ytest_2)/3

In [166]:
final_Ytest.sum()

##Preparation For Submission

In [167]:
submission = pd.read_csv("../input/ml-project-dataset/sample_submission.csv")

In [168]:
submission.head()

In [169]:
submission["label"] = final_Ytest

In [170]:
submission.head()

In [171]:
submission.to_csv("./submission2.csv",index = False)