In [141]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [142]:
df=pd.read_csv('OnlineNewsPopularity.csv') #reading data

In [143]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39644 entries, 0 to 39643
Data columns (total 61 columns):
url                               39644 non-null object
timedelta                         39644 non-null int64
 n_tokens_title                   39644 non-null int64
 n_tokens_content                 39644 non-null int64
 n_unique_tokens                  39644 non-null float64
 n_non_stop_words                 39644 non-null float64
 n_non_stop_unique_tokens         39644 non-null float64
 num_hrefs                        39644 non-null int64
 num_self_hrefs                   39644 non-null int64
 num_imgs                         39644 non-null int64
 num_videos                       39644 non-null int64
 average_token_length             39644 non-null float64
 num_keywords                     39644 non-null int64
 data_channel_is_lifestyle        39644 non-null int64
 data_channel_is_entertainment    39644 non-null int64
 data_channel_is_bus              39644 non-null int64
 d

# Task1 . Removing non predictive attributes and splitting data

In [144]:
shares=df['shares']
features=df.drop(['url','timedelta','shares'],axis=1)
features = (features - features.mean()) / (features.max() - features.min()) #Normalisation of data

In [145]:
# Splitting data into test and train
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(features, shares, test_size=0.3, random_state=100)

# Task2: Linear Regression on test and train data

In [146]:
from sklearn.linear_model import LinearRegression
model1= LinearRegression()
model1.fit(X_train,y_train)
y_predict_train=model1.predict(X_train)
y_predict_test=model1.predict(X_test)


In [147]:
from sklearn.metrics import r2_score #performance metric

In [148]:
r2score_train=r2_score(y_train,y_predict_train)
print(r2score_train)

0.022708408923445167


In [149]:
r2score_test=r2_score(y_test,y_predict_test)
print(r2score_test)

0.01949173984149566


# Task3: Converting shares into a binary variable

In [150]:
#Creation of binary variable for shares based on median.
median_shares=np.median(shares)
shares_ind=[]
for share in shares:
    if share>= median_shares:
        shares_ind.append(1)
    else:
        shares_ind.append(0)
shares_binary=pd.DataFrame(shares_ind)

# Task4: Logistic Regressionon train and test data

In [151]:
#Splitting data into test and train
from sklearn.linear_model import LogisticRegression
X_train, X_test, y_train, y_test = train_test_split(features, shares_binary, test_size=0.3, random_state=100)
model2= LogisticRegression()
model2.fit(X_train,y_train)
y_predict_train=model2.predict(X_train)
y_predict_test=model2.predict(X_test)


  y = column_or_1d(y, warn=True)


In [152]:
from sklearn.metrics import accuracy_score #Performance metric for logistic regression

In [153]:
accuracy_train=accuracy_score(y_train,y_predict_train)
print(accuracy_train)

0.646990990990991


In [154]:
accuracy_test=accuracy_score(y_test,y_predict_test)
print(accuracy_test)

0.646628552211199


# Experimentation

In [155]:
features['inter']=np.ones((df.shape[0]), dtype=np.int) #Creating a column with zeros to calculate intercept

In [156]:
#Cost function for gradient descent linear regression
def gradientDescent(x, y, theta, alpha, m, numIterations):
    xTrans = x.transpose()
    for i in range(0, numIterations):
        hypothesis = np.dot(x, theta)
        loss = hypothesis - y
        cost = np.sum(loss ** 2) / (2 * m)
        #print("Iteration %d | Cost: %f" % (i, cost))
        gradient = np.dot(xTrans, loss) / m
        theta = theta - alpha * gradient
    return theta

In [157]:
numIterations= 1000
alpha = 0.005
m, n = np.shape(features)
theta = np.ones(n)
theta = gradientDescent(features, shares, theta, alpha, m, numIterations)

In [158]:
y_pred=np.dot(features,theta.transpose())
score=r2_score(shares,y_pred)
score

0.00971327369544861

In [159]:
# Experimenting with the above cost function on train and test data with alpha=0.3,0.1,0.01,0.05

alpha_values=[1,0.3,0.1,0.05,0.01,0.005]
X_train, X_test, y_train, y_test = train_test_split(features, shares, test_size=0.3, random_state=100)
for alpha in alpha_values:
        m, n = np.shape(X_train)
        theta = np.ones(n)
        theta = gradientDescent(X_train, y_train, theta, alpha, m, 1000)
        y_pred_train=np.dot(X_train,theta.transpose())
        y_pred_test=np.dot(X_test,theta.transpose())
        score_train=r2_score(y_train,y_pred_train)
        score_test=r2_score(y_test,y_pred_test)
        print(alpha, score_train, score_test)

1 0.020576112248769762 0.017248173242646825
0.3 0.01878044744516394 0.016025412375853865
0.1 0.016479621826107294 0.014776966546349635
0.05 0.01510438252841606 0.013969497180605361
0.01 0.012023880651999885 0.011568591730339639
0.005 0.009630518886600825 0.009259297326538518


In [160]:
#Cost function for gradient descent logistic regression
def sigmoid(x):
    f = 1.0/(1.0 + np.exp(-x))
    return f
def logistic_reg(x, y, theta, alpha, m, Iterations):
    xTrans = x.transpose()
    for i in range(0, Iterations):
        hypothesis = sigmoid(np.dot(x, theta))
        cost = np.sum( -y[0]*hypothesis - (1-y[0])*(np.log(1-hypothesis)) )/m
        #print("Iteration %d | Cost: %f" % (i, cost))
        gradient = np.dot(xTrans,hypothesis-y[0]) / m
        theta = theta - alpha * gradient
    return theta

In [161]:
Iterations= 100
alpha = 0.03
m, n = np.shape(features)
theta = np.ones(n)
y=shares_binary
x=features
theta = logistic_reg(x, np.array(y), theta, alpha,m, Iterations)
theta

array([ 0.99608099,  0.99205797,  0.99998826,  0.9999703 ,  0.9999782 ,
        0.99488851,  0.99587325,  0.99410136,  0.99764963,  0.97832266,
        0.98402177,  0.98919725,  0.98659852,  0.9896444 ,  0.99283891,
        0.97416342,  1.03014262,  1.01442153,  0.99956805,  0.99967678,
        0.99382137,  0.96760858,  0.98761348,  0.93864026,  0.99874416,
        0.99651687,  0.99898508,  0.99622764,  0.99804989,  1.01372188,
        1.01229212,  1.01611379,  1.01406533,  1.0120251 ,  0.96767886,
        0.96410292,  0.93178178,  0.98882778,  1.00329461,  1.02157991,
        1.00908738,  0.97737844,  0.97213602,  0.97449017,  0.97380187,
        0.99991616,  0.95070344,  1.01072742,  0.96833179,  0.99492168,
        0.92129353,  0.99070172,  0.98746882,  0.99247906,  0.93895079,
        0.9781515 ,  0.9910545 ,  0.95246172, -0.58087019])

In [164]:
y_pred=np.dot(features,theta.transpose())
y_predicted=[]
for i in y_pred:
    value=sigmoid(i)
    if value>=0.5:
        y_predicted.append(1)
    else:
        y_predicted.append(0)
y_predicted=np.array(y_predicted)
score=accuracy_score(shares_binary,y_predicted)
score

0.536096256684492

In [165]:
## Experimenting with the above cost function on train and test data with alpha=0.3,0.1,0.01,0.05
alpha_values=[1,0.3,0.1,0.05,0.01,0.005]
X_train, X_test, y_train, y_test = train_test_split(features, shares_binary, test_size=0.3, random_state=25)
for alpha in alpha_values:
        m, n = np.shape(X_train)
        theta = np.ones(n)
        theta = logistic_reg(X_train, np.array(y_train), theta, alpha,m, 1000)
        y_pred_train=np.dot(X_train,theta.transpose())
        y_pred_test=np.dot(X_test,theta.transpose())
        y_predicted_train=[]
        y_predicted_test=[]
        for i in y_pred_train:
            value=sigmoid(i)
            if value>=0.5:
                y_predicted_train.append(1)
            else:
                y_predicted_train.append(0)
        for i in y_pred_test:
            value=sigmoid(i)
            if value>=0.5:
                y_predicted_test.append(1)
            else:
                y_predicted_test.append(0)
        y_predicted_train=np.array(y_predicted_train)
        y_predicted_test=np.array(y_predicted_test)
        score_train=accuracy_score(y_train,y_predicted_train)
        score_test=accuracy_score(y_test,y_predicted_test)
        print(alpha, score_train, score_test)

1 0.46645045045045047 0.46628552211198926
0.3 0.46645045045045047 0.46628552211198926
0.1 0.46645045045045047 0.46628552211198926
0.05 0.4664864864864865 0.46628552211198926
0.01 0.47549549549549547 0.4750294266016479
0.005 0.5033513513513513 0.5011770640659156


In [200]:
# Experiment2: Selecting 10 random columns and running the model(selecting the first 10 columns)
random_slice= features.iloc[:,[2,44,51,10,9,5,18,33,27,40]]
random_slice['inter']=np.ones((df.shape[0]), dtype=np.int) #Creating a column with zeros to calculate intercept
#Based on the above experimentation the best learning rate is 0.3 for linear regression and 0.005 for logistic
#Running linear and logistic models based on this learning rate


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [201]:
#Linear model
alpha_values=[0.3]
X_train, X_test, y_train, y_test = train_test_split(random_slice, shares, test_size=0.3, random_state=100)
for alpha in alpha_values:
        m, n = np.shape(X_train)
        theta = np.ones(n)
        theta = gradientDescent(X_train, y_train, theta, alpha, m, 1000)
        y_pred_train=np.dot(X_train,theta.transpose())
        y_pred_test=np.dot(X_test,theta.transpose())
        score_train=r2_score(y_train,y_pred_train)
        score_test=r2_score(y_test,y_pred_test)
        print(alpha, score_train, score_test)

0.3 0.010242716276614994 0.010386279797535924


In [203]:
#Logistic Model
alpha_values=[0.005]
X_train, X_test, y_train, y_test = train_test_split(random_slice, shares_binary, test_size=0.3, random_state=25)
for alpha in alpha_values:
        m, n = np.shape(X_train)
        theta = np.ones(n)
        theta = logistic_reg(X_train, np.array(y_train), theta, alpha,m, 1000)
        y_pred_train=np.dot(X_train,theta.transpose())
        y_pred_test=np.dot(X_test,theta.transpose())
        y_predicted_train=[]
        y_predicted_test=[]
        for i in y_pred_train:
            value=sigmoid(i)
            if value>=0.5:
                y_predicted_train.append(1)
            else:
                y_predicted_train.append(0)
        for i in y_pred_test:
            value=sigmoid(i)
            if value>=0.5:
                y_predicted_test.append(1)
            else:
                y_predicted_test.append(0)
        y_predicted_train=np.array(y_predicted_train)
        y_predicted_test=np.array(y_predicted_test)
        score_train=accuracy_score(y_train,y_predicted_train)
        score_test=accuracy_score(y_test,y_predicted_test)
        print(alpha, score_train, score_test)

0.005 0.4720720720720721 0.4715823104086094


# Experimentaion 3

In [198]:
#Running regression model on 10 best chosen features
# Features selected are -  n_tokens_content,  num_imgs, weekday_is_friday,weekday_is_saturday, weekday_is_sunday, global_subjectivity
## global_rate_positive_words, title_sentiment_polarity, data_channel_is_socmed	, self_reference_max_shares
features.columns


Index([' n_tokens_title', ' n_tokens_content', ' n_unique_tokens',
       ' n_non_stop_words', ' n_non_stop_unique_tokens', ' num_hrefs',
       ' num_self_hrefs', ' num_imgs', ' num_videos', ' average_token_length',
       ' num_keywords', ' data_channel_is_lifestyle',
       ' data_channel_is_entertainment', ' data_channel_is_bus',
       ' data_channel_is_socmed', ' data_channel_is_tech',
       ' data_channel_is_world', ' kw_min_min', ' kw_max_min', ' kw_avg_min',
       ' kw_min_max', ' kw_max_max', ' kw_avg_max', ' kw_min_avg',
       ' kw_max_avg', ' kw_avg_avg', ' self_reference_min_shares',
       ' self_reference_max_shares', ' self_reference_avg_sharess',
       ' weekday_is_monday', ' weekday_is_tuesday', ' weekday_is_wednesday',
       ' weekday_is_thursday', ' weekday_is_friday', ' weekday_is_saturday',
       ' weekday_is_sunday', ' is_weekend', ' LDA_00', ' LDA_01', ' LDA_02',
       ' LDA_03', ' LDA_04', ' global_subjectivity',
       ' global_sentiment_polarity', ' gl

In [199]:
select_data=features.iloc[:,[1,7,33,34,35,42,44,55,14,27]]
select_data['inter']=np.ones((df.shape[0]), dtype=np.int)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [194]:
#Linear model
alpha_values=[0.3]
X_train, X_test, y_train, y_test = train_test_split(select_data, shares, test_size=0.3, random_state=100)
for alpha in alpha_values:
        m, n = np.shape(X_train)
        theta = np.ones(n)
        theta = gradientDescent(X_train, y_train, theta, alpha, m, 1000)
        y_pred_train=np.dot(X_train,theta.transpose())
        y_pred_test=np.dot(X_test,theta.transpose())
        score_train=r2_score(y_train,y_pred_train)
        score_test=r2_score(y_test,y_pred_test)
        print(alpha, score_train, score_test)

0.3 0.004504066560499953 0.004128715437329067


In [202]:
#Logistic Model
alpha_values=[0.005]
X_train, X_test, y_train, y_test = train_test_split(select_data, shares_binary, test_size=0.3, random_state=25)
for alpha in alpha_values:
        m, n = np.shape(X_train)
        theta = np.ones(n)
        theta = logistic_reg(X_train, np.array(y_train), theta, alpha,m, 1000)
        y_pred_train=np.dot(X_train,theta.transpose())
        y_pred_test=np.dot(X_test,theta.transpose())
        y_predicted_train=[]
        y_predicted_test=[]
        for i in y_pred_train:
            value=sigmoid(i)
            if value>=0.5:
                y_predicted_train.append(1)
            else:
                y_predicted_train.append(0)
        for i in y_pred_test:
            value=sigmoid(i)
            if value>=0.5:
                y_predicted_test.append(1)
            else:
                y_predicted_test.append(0)
        y_predicted_train=np.array(y_predicted_train)
        y_predicted_test=np.array(y_predicted_test)
        score_train=accuracy_score(y_train,y_predicted_train)
        score_test=accuracy_score(y_test,y_predicted_test)
        print(alpha, score_train, score_test)

0.005 0.4803243243243243 0.4815873549688919
