In [1]:
import pandas as pd
from sklearn import metrics
from sklearn.linear_model import LogisticRegression

try:
    # scikit-learn >= 0.18 exposes train_test_split from model_selection.
    from sklearn.model_selection import train_test_split
except ImportError:
    # Fallback for older releases that only ship the since-removed cross_validation module.
    from sklearn.cross_validation import train_test_split

In [2]:
# Load the CSV at data/anes_dataset.csv into a DataFrame; the relative path is
# resolved against the kernel's current working directory.
dataset = pd.read_csv('data/anes_dataset.csv')

In [3]:
# Sanity-check the load: number of rows first, then the column names.
row_count = dataset.shape[0]
print(row_count)
print(dataset.columns.values)

944
['popul' 'TVnews' 'selfLR' 'ClinLR' 'DoleLR' 'PID' 'age' 'educ' 'income'
 'vote']


In [4]:
# Per-column summary statistics (count, mean, std, min, quartiles, max),
# displayed as the cell's output.
dataset.describe()

Unnamed: 0,popul,TVnews,selfLR,ClinLR,DoleLR,PID,age,educ,income,vote
count,944.0,944.0,944.0,944.0,944.0,944.0,944.0,944.0,944.0,944.0
mean,306.381356,3.727754,4.325212,2.939619,5.394068,2.842161,47.043432,4.565678,16.331568,0.416314
std,1082.606745,2.677235,1.438436,1.383725,1.269349,2.273337,16.42313,1.599287,5.974781,0.493208
min,0.0,0.0,1.0,1.0,1.0,0.0,19.0,1.0,1.0,0.0
25%,1.0,1.0,3.0,2.0,5.0,1.0,34.0,3.0,14.0,0.0
50%,22.0,3.0,4.0,3.0,6.0,2.0,44.0,4.0,17.0,0.0
75%,110.0,7.0,6.0,4.0,6.0,5.0,58.0,6.0,21.0,1.0
max,7300.0,7.0,7.0,7.0,7.0,6.0,91.0,7.0,24.0,1.0


In [5]:
# Preview the first five rows of the raw data.
dataset.head()

Unnamed: 0,popul,TVnews,selfLR,ClinLR,DoleLR,PID,age,educ,income,vote
0,0,7,7,1,6,6,36,3,1,1
1,190,1,3,3,5,1,20,4,1,0
2,31,7,2,2,6,1,24,6,1,0
3,83,4,3,4,5,1,28,6,1,0
4,640,7,5,6,4,0,68,6,1,0


# Split the dataset

In [8]:
# Hold-out configuration. Seeding the shuffle makes the train/test partition,
# and therefore every accuracy figure computed from it in later cells,
# reproducible across "Restart Kernel -> Run All".
TEST_FRACTION = 0.4
RANDOM_STATE = 42

# The last column of the frame is used as the label; every column before it
# is used as a predictor.
header = list(dataset.columns.values)
features = header[:-1]
target = header[-1]

print("Features :",features)
print("Target :",target)

# Shuffle and split the rows into training and test partitions.
x_train, x_test, y_train, y_test = train_test_split(
    dataset[features],
    dataset[target],
    test_size=TEST_FRACTION,
    random_state=RANDOM_STATE,
)

# Confirm the partition sizes before training anything on them.
print("x_train Shape",x_train.shape)
print("x_test Shape",x_test.shape)
print("y_train Shape",y_train.shape)
print("y_test Shape",y_test.shape)

Features : ['popul', 'TVnews', 'selfLR', 'ClinLR', 'DoleLR', 'PID', 'age', 'educ', 'income']
Target : vote
x_train Shape (566, 9)
x_test Shape (378, 9)
y_train Shape (566,)
y_test Shape (378,)


In [9]:
# Reduced model: logistic regression on four columns only
# (TVnews, age, educ, income).
features_4 = ['TVnews','age','educ','income']

# fit() returns the estimator itself, so construction and training are chained.
model_with_4_features = LogisticRegression().fit(x_train[features_4], y_train)

# Accuracy on the same rows the model was fitted on (training accuracy,
# not a hold-out score).
train_accuracy = model_with_4_features.score(x_train[features_4], y_train)
print(train_accuracy)

0.600706713781


In [10]:
# Full model: logistic regression on every feature column of x_train.
# fit() returns the estimator itself, so construction and training are chained.
model_with_all_features = LogisticRegression().fit(x_train, y_train)

# Accuracy on the training rows; note this rebinds the `train_accuracy`
# name that the four-feature cell also assigns.
train_accuracy = model_with_all_features.score(x_train, y_train)
print(train_accuracy)

0.904593639576


# Prediction with Logistic Regression

In [11]:
# Use the first row of the hold-out set as a single sample observation:
# once restricted to the four-column subset, once with every feature.
test_observation_for_4_features = x_test[features_4].head(1)
print(test_observation_for_4_features)

test_observation_for_all_features = x_test.head(1)
print(test_observation_for_all_features)

     TVnews  age  educ  income
683       5   59     5      21
     popul  TVnews  selfLR  ClinLR  DoleLR  PID  age  educ  income
683    100       5       6       2       6    6   59     5      21


In [12]:
# Predict the label of the single sample row with each model, then show the
# two predictions in the same order (four-feature model first).
single_pred_4_features = model_with_4_features.predict(test_observation_for_4_features)
single_pred_all_features = model_with_all_features.predict(test_observation_for_all_features)

print(single_pred_4_features)
print(single_pred_all_features)

[1]
[1]


# Checking Accuracy Score

In [13]:
# Step 1: predict labels for every row of the hold-out set with each model.
model_with_4_features_pred = model_with_4_features.predict(x_test[features_4])
model_with_all_features_pred = model_with_all_features.predict(x_test)

# Step 2: fraction of hold-out rows whose predicted label matches the true one.
model_with_4_features_pred_accuracy = metrics.accuracy_score(
    y_test, model_with_4_features_pred
)
model_with_all_features_pred_accuracy = metrics.accuracy_score(
    y_test, model_with_all_features_pred
)

# Step 3: report the four-feature score first, then the all-feature score.
print(model_with_4_features_pred_accuracy)
print(model_with_all_features_pred_accuracy)

0.592592592593
0.910052910053
