In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
from patsy import dmatrices
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from sklearn.cross_validation import train_test_split
from sklearn import metrics
from sklearn.cross_validation import cross_val_score
import seaborn as sns # More snazzy plotting library
import itertools



In [2]:
# Show up to 40 columns when a DataFrame is displayed
pd.set_option("display.max_columns", 40)

In [3]:
# Load the working dataset into `dta` and display the median of its `gross` column.
# NOTE(review): the path below is an absolute, machine-specific location; the
# recorded output of this cell is a FileNotFoundError for exactly this path, so
# it has to be adjusted before the notebook can run on another machine.
file_path = "/home/user/projects/data_mining/predictive_data_mining/logistic_regression/working_dataset.csv"
dta = pd.read_csv(file_path)
dta.gross.median()

FileNotFoundError: File b'/home/user/projects/data_mining/predictive_data_mining/logistic_regression/working_dataset.csv' does not exist

In [None]:
# Order the columns alphabetically so the frame is easier to scan.
# reindex(columns=...) is the supported spelling; reindex_axis(..., axis=1) is
# deprecated and no longer exists in pandas 1.0 and later.
dta = dta.reindex(columns=sorted(dta.columns))
dta.head(5)

In [None]:
# Keep only the non-text columns.
# Text columns are identified by dtype rather than by probing one value per
# column: a single-row probe misclassifies a text column whose probed entry is
# missing, and fails when the probed index label does not exist.
str_list = list(dta.select_dtypes(include=['object']).columns)
# Everything that is not a text column is treated as numeric
num_list = dta.columns.difference(str_list)
dta_clean = dta[num_list]
# FIXME (Rihards): naive imputation - every missing value is replaced with 0
dta_clean = dta_clean.fillna(0)
# Alphabetical column order
dta_clean = dta_clean.reindex(columns=sorted(dta_clean.columns))
dta_clean.head(10)

In [None]:
# Drop every row that holds a 0 in any column other than
# 'facenumber_in_poster' (that column is exempt from the filter)
checked_columns = [name for name in dta_clean.columns if name != 'facenumber_in_poster']
has_no_zero = (dta_clean[checked_columns] != 0).all(axis=1)
dta_clean = dta_clean[has_no_zero]
dta_clean.head(10)

In [None]:
# Work on an independent copy: dta_clean may be a boolean-filtered selection of
# an earlier frame, and adding columns to such a selection triggers pandas'
# SettingWithCopyWarning.
dta_clean = dta_clean.copy()
# Age column: 2017 minus the title year
dta_clean['age'] = 2017 - dta_clean['title_year']
# Binary column: 1 when gross is at or above the median gross, otherwise 0
dta_clean['gross_median'] = (dta_clean.gross >= dta_clean.gross.median()).astype(int)
# First rows of each of the two groups
dta_clean.groupby('gross_median').head()

In [None]:
# Column means within each gross_median group
dta_clean.groupby('gross_median').mean()

In [None]:
# Render matplotlib figures inline in the notebook output
%matplotlib inline

In [None]:
# Histogram of the raw gross values.
# `gross_class` must not be used here: it is only created further down the
# notebook, so referencing it fails on a fresh kernel, and the title and axis
# label of this figure describe gross itself.
dta_clean.gross.hist()
plt.title('Histogram of Gross')
plt.xlabel('Gross')
plt.ylabel('Frequency')

In [None]:
# Histogram of the binary gross_median indicator (0 = below the median gross,
# 1 = at or above it), matching the title and axis label of this figure
dta_clean.gross_median.hist()
plt.title('Histogram of Gross Median')
plt.xlabel('Gross Median')
plt.ylabel('Frequency')

In [None]:
# Heatmap of the pairwise correlations between all columns of dta_clean
f, ax = plt.subplots(figsize=(8, 8))
plt.title('Pearson Correlation of Movie Features')
# Cast everything to float first, then compute the correlation matrix
correlations = dta_clean.astype(float).corr()
sns.heatmap(
    correlations,
    linewidths=0.25,
    vmax=1.0,
    square=True,
    cmap="YlGnBu",
    linecolor='black',
)

In [None]:
# Build the regression inputs as DataFrames: y holds the gross_median outcome,
# X the predictors named on the right-hand side of the formula. A predictor can
# be wrapped in a transformation function inside the formula to improve the
# linearity of its relationship with the outcome.
y, X = dmatrices('gross_median ~  budget + cast_total_facebook_likes + \
                 director_facebook_likes + duration + facenumber_in_poster + imdb_score + \
                 movie_facebook_likes + num_critic_for_reviews + num_user_for_reviews + num_voted_users + title_year',
                 dta_clean, return_type="dataframe")
X.head()

In [None]:
# Collapse the outcome into a flat 1-D array
y = np.asarray(y).ravel()

In [None]:
# Fit a logistic regression on the complete X / y
model = LogisticRegression().fit(X, y)

# Accuracy measured on the same data the model was trained on
model.score(X, y)

In [None]:
# Pair every predictor name with its fitted coefficient and show them as a table
coefficient_rows = list(zip(X.columns, np.transpose(model.coef_)))
pd.DataFrame(coefficient_rows)

In [None]:
# Hold out 30% of the rows as a test set (fixed random_state) and fit on the rest
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0)
model2 = LogisticRegression()
model2.fit(X_train, y_train)
# Show the training predictors and outcomes
for training_part in (X_train, y_train):
    print(training_part)

In [None]:
# Class labels predicted for the held-out rows
predicted = model2.predict(X_test)
print(predicted)

In [None]:
# Per-class probabilities predicted for the held-out rows
probs = model2.predict_proba(X_test)
print(probs)

In [None]:
# Test-set accuracy, followed by the ROC AUC computed from the second
# probability column
test_accuracy = metrics.accuracy_score(y_test, predicted)
print(test_accuracy)
test_auc = metrics.roc_auc_score(y_test, probs[:, 1])
print(test_auc)

In [None]:
# ROC curve of the test-set predictions, using the second probability column
fpr, tpr, threshold = metrics.roc_curve(y_test, probs[:, 1])
roc_auc = metrics.auc(fpr, tpr)
lw = 2
curve_label = 'ROC curve (area = %0.2f)' % roc_auc
plt.plot(fpr, tpr, color='darkorange', lw=lw, label=curve_label)
# Dashed diagonal from (0, 0) to (1, 1) as a reference line
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")

In [None]:
# 10-fold cross-validated accuracy of a fresh logistic regression on X / y
scores = cross_val_score(LogisticRegression(), X, y, scoring='accuracy', cv=10)
# Per-fold scores first, then their mean
for shown in (scores, scores.mean()):
    print(shown)

In [None]:
# First rows of the predictor frame X
X.head()

In [None]:
# Retry the binary model with normalised predictors.

# Scale every column of X to unit L2 norm. normalize(..., axis=0) works
# column-wise on the whole 2-D frame, so no per-column loop is needed and no
# 1-D input is handed to scikit-learn (1-D input is deprecated / rejected by
# newer releases).
X = pd.DataFrame(preprocessing.normalize(X, norm='l2', axis=0),
                 columns=X.columns, index=X.index)

# separate test and train data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
model2_norm = LogisticRegression()
model2_norm.fit(X_train, y_train)

# Predict class labels and probabilities for the test set with the model that
# was fitted on the normalised data. model2 was fitted on the unscaled
# predictors, so it must not be used for these predictions.
predicted_norm = model2_norm.predict(X_test)
probs_norm = model2_norm.predict_proba(X_test)

# evaluation metrics (left disabled)
#print(metrics.accuracy_score(y_test, predicted_norm))
#print(metrics.roc_auc_score(y_test, probs_norm[:, 1]))

# evaluate the model using 10-fold cross-validation
scores = cross_val_score(model2_norm, X, y, scoring='accuracy', cv=10)
#print(scores)
#print(scores.mean())
X.head()

In [None]:
#remove gross median column
#dta_clean = dta_clean.drop('gross_median', 1)

def label_gross(gross):
    """Map a gross value to its class label (1 to 5).

    Classes: below 1 million -> 1, 1 to 10 million -> 2, 10 to 50 million -> 3,
    50 to 200 million -> 4, 200 million and above -> 5. Lower bounds are
    inclusive. When no range matches (for example a NaN input, for which every
    comparison is false) the function falls through and returns None.
    """
    # Exclusive upper bound of each class, in ascending order
    upper_bounds = ((1000000, 1), (10000000, 2), (50000000, 3), (200000000, 4))
    for bound, label in upper_bounds:
        if gross < bound:
            return label
    if gross >= 200000000:
        return 5


# Label every row with its gross class (1-5):
# below 1 million, 1 to 10 million, 10 to 50 million, 50 to 200 million,
# 200 million and above.
# (An alternative encoding with one 0/1 indicator column per range was tried
# and left disabled.)
dta_clean['gross_class'] = dta_clean.gross.apply(label_gross)

# Column means within each gross class
dta_clean.groupby('gross_class').mean()

In [None]:
# Histograms of gross, one per gross class, drawn on a logarithmic x axis
class_legend = [
    (1, 'bl1'),
    (2, 'f1to10'),
    (3, 'f10to50'),
    (4, 'f50to200'),
    (5, 'ov200'),
]
for class_id, legend_name in class_legend:
    dta_clean.gross[dta_clean.gross_class == class_id].hist(label=[legend_name])
plt.title('Histogram of Gross Classes')
plt.xlabel('Gross')
plt.ylabel('Frequency')
plt.gca().set_xscale("log")
plt.legend(loc="upper left")

In [None]:
#dta_clean.boxplot(by='gross_class')

In [None]:
# Build the regression inputs as DataFrames for the multi-class target: y holds
# the gross_class outcome, X the predictors named on the right-hand side of the
# formula. A predictor can be wrapped in a transformation function inside the
# formula to improve the linearity of its relationship with the outcome.
y, X = dmatrices('gross_class ~ actor_1_facebook_likes + actor_2_facebook_likes + \
                 actor_3_facebook_likes + aspect_ratio + budget + cast_total_facebook_likes + \
                 director_facebook_likes + duration + facenumber_in_poster + imdb_score + \
                 movie_facebook_likes + num_critic_for_reviews + num_user_for_reviews + num_voted_users + title_year',
                 dta_clean, return_type="dataframe")


In [None]:
# Collapse the outcome y into a flat 1-D array (X is left as it is)
y = np.asarray(y).ravel()

In [None]:
# Fit a logistic regression on the complete X / y
model = LogisticRegression().fit(X, y)

# Accuracy measured on the same data the model was trained on
model.score(X, y)

In [None]:
# Hold out 30% of the rows as a test set (fixed random_state) and fit on the rest
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0)
model2 = LogisticRegression().fit(X_train, y_train)
model2

In [None]:
# Class labels predicted for the held-out rows
predicted = model2.predict(X_test)
print(predicted)

In [None]:
# Per-class probabilities predicted for the held-out rows
probs = model2.predict_proba(X_test)
print(probs)

In [None]:
# Accuracy of the predicted class labels on the test set
test_accuracy = metrics.accuracy_score(y_test, predicted)
print(test_accuracy)


In [None]:


# Compare LogisticRegression solvers on the same train/test split.
# NOTE: a min-max scaling experiment was tried here and left disabled, so the
# models below are fitted on X exactly as it was built above.

# separate test and train data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)


def _fit_and_predict(solver):
    """Fit a LogisticRegression with `solver` on the training split.

    Returns a (fitted model, predicted test-set labels) pair.
    """
    fitted = LogisticRegression(solver=solver)
    fitted.fit(X_train, y_train)
    return fitted, fitted.predict(X_test)


# The numbered names are kept because later cells refer to them (e.g. model5)
model3, predicted3 = _fit_and_predict("newton-cg")
model4, predicted4 = _fit_and_predict("lbfgs")
model5, predicted5 = _fit_and_predict("liblinear")
model6, predicted6 = _fit_and_predict("sag")

# Report the test-set accuracy of every solver
solver_predictions = [
    ("newton-cg", predicted3),
    ("lbfgs", predicted4),
    ("liblinear", predicted5),
    ("sag", predicted6),
]
for solver_name, solver_predicted in solver_predictions:
    print("%s solver accuracy" % solver_name)
    print(metrics.accuracy_score(y_test, solver_predicted))


In [None]:
# 10-fold cross-validated accuracy of the liblinear model (model5) on X / y
scores = cross_val_score(model5, X, y, scoring='accuracy', cv=10)
# Per-fold scores first, then their mean
for shown in (scores, scores.mean()):
    print(shown)

In [None]:
# Candidate predictor columns: every column of dta_clean except the ones
# listed in `excluded`. Filtering (instead of list.remove) keeps the column
# order and does not raise when one of the excluded columns is absent.
excluded = ('gross_class', 'gross_median', 'age', 'gross')
headers = [name for name in dta_clean.columns.values if name not in excluded]
print(headers)
# Number of distinct subsets of the candidate columns, the empty subset
# included. A list of n items has exactly 2 ** n subsets, so there is no need
# to enumerate every combination just to count them.
iterator = 2 ** len(headers)
print(iterator)