In [4]:
import csv
from collections import defaultdict
import numpy as np
from sklearn import preprocessing

responseUserID = {}
userScore = {}
videoCounts = defaultdict(int)
videoCountsClassification = defaultdict(int)
screen_name_to_gender = defaultdict(int)
screen_name_to_year_of_birth = defaultdict(int)
screen_name_to_education = defaultdict(int)
screen_name_to_post_count = defaultdict(int)
screen_name_to_comment_count = defaultdict(int)
screen_name_to_nationality = {}

In [5]:
## Get ResponseID - UserID matching
with open('../../data/survey_post_EarthSciences_ResGeo202_Spring2015_respondent_metadata.csv', 'r') as csvfile :	
    lines = csv.reader(csvfile, delimiter = ',', quotechar = '"')
    
    for line in lines :
        if line[1] not in responseUserID:
            responseUserID[line[1]] = line[2]


# Get Survey Data, only get the ID that has a matching 
with open('../../data/survey_post_EarthSciences_ResGeo202_Spring2015_response.csv', 'r') as csvfile :	
    lines = csv.reader(csvfile, delimiter = ',', quotechar = '"')
    for line in lines :
        if (line[2] == "Q1.1" and line[4] != ''):
            if line[1] in responseUserID:
                score = int(line[4])
                if score < 6:
                    userScore[responseUserID[line[1]]] = 5
                else:
                    userScore[responseUserID[line[1]]] = int(line[4])
                

In [6]:
# extracting demographic feature
with open('../../data/EarthSciences_ResGeo202_Spring2015_demographics.csv', 'r') as csvfile :
	lines = csv.reader(csvfile, delimiter = ',', quotechar = '"')
	for line in lines :
		if line[0] in userScore : 

			# gender feature: blank -> 0, m -> -1, f -> +1
			if line[1] == "m" :
				screen_name_to_gender[line[0]] = -1
			if line[1] == "f" :
				screen_name_to_gender[line[0]] = 1
			if line[1] not in screen_name_to_gender and line[0] != "\\N" :
				screen_name_to_gender[line[0]] = 0

			# year_of_birth
			if line[2] != "\\N" :
				screen_name_to_year_of_birth[line[0]] = 2016 - int(line[2])

			# ?????????????????????????????????????????????????
			## should set it to 0 if birth year not provided???
			if line[2] == "\\N" and line[0] != "\\N":
				screen_name_to_year_of_birth[line[0]] = 0
			
			# level_of_education: 
			# Doctorate -> 7
			# Masters or professional degree -> 6
			# Bachelors -> 5
			# Associates -> 4
			# Secondary/High School -> 3
			# Junior secondary/junior high/middle School -> 2
			# Elementary/Primary School -> 1
			# None, Other, User withheld, Signup before level collected -> 0
			if line[3] == "Doctorate" :
				screen_name_to_education[line[0]] = 7
			if line[3] == "Masters or professional degree" :
				screen_name_to_education[line[0]] = 6
			if line[3] == "Bachelors" :
				screen_name_to_education[line[0]] = 5
			if line[3] == "Associates" :
				screen_name_to_education[line[0]] = 4 
			if line[3] == "Secondary/High School" :
				screen_name_to_education[line[0]] = 3 
			if line[3] == "Junior secondary/junior high/middle School" :
				screen_name_to_education[line[0]] = 2
			if line[3] == "Elementary/Primary School" :
				screen_name_to_education[line[0]] = 1
			if line[0] not in screen_name_to_education and line[0] != "\\N":
				screen_name_to_education[line[0]] = 0

In [7]:
# Extracting forum feature
with open('../../data/EarthSciences_ResGeo202_Spring2015_Forum.csv', 'r') as csvfile :
	lines = csv.reader(csvfile, delimiter = ',', quotechar = '"')
	for line in lines :
		if len(line) > 2 and line[1] in userScore:
			# print line[1], line[2]
			if line[2] == "CommentThread":
				screen_name_to_post_count[line[1]] += 1
			if line[2] == "Comment":
				screen_name_to_comment_count[line[1]] += 1

In [8]:

# Get nationality features
with open('../../data/EarthSciences_ResGeo202_Spring2015_demographics.csv', 'r') as csvfile :
    lines = csv.reader(csvfile, delimiter = ',', quotechar = '"')
    for line in lines :
        if line[0] in userScore and line[4] != "\\N":
            screen_name_to_nationality[line[0]] = line[4]

In [9]:
# Extracting video counts
first = True
with open('../countVideos/EarthSciences_ResGeo202_Spring2015_UserVideo_Matrix.csv', 'r') as csvfile :
	lines = csv.reader(csvfile, delimiter = ',', quotechar = '"')
	for line in lines :
		if first:
			first = False
			continue
		key = line[0]
		count = 0
		for i in xrange(1, len(line)):
			count += int(line[i])
		videoCounts[key] = count
		countClass = 0
		if count > 18:
			countClass = 1
		videoCountsClassification[key] = countClass
        

In [10]:
le = preprocessing.LabelEncoder()
le.fit(screen_name_to_nationality.values())
encoding_min = min(le.transform(screen_name_to_nationality.values()))
encoding_max = max(le.transform(screen_name_to_nationality.values()))
enc = preprocessing.OneHotEncoder(n_values = encoding_max + 1, sparse = False,dtype='int32')
enc.fit([encoding_min],[encoding_max])


numSamples = len(userScore.keys())
numFeatures = 5 + (encoding_max+1)
X = np.zeros((numSamples, numFeatures))
# Y = np.array([0])
Y = []
count = 0
for id in userScore:
    newrow = [screen_name_to_gender[id], screen_name_to_year_of_birth[id], screen_name_to_education[id], 
              screen_name_to_post_count[id], screen_name_to_comment_count[id]]
    if id in screen_name_to_nationality:
        newrow += enc.transform(le.transform(screen_name_to_nationality[id]))[0,:].tolist()
    else:
        newrow += [0] * (encoding_max + 1)
    X[count, :] = newrow
    count += 1
#     Y.append(userScore[id])
    Y.append(videoCountsClassification[id])



In [11]:
from sklearn import linear_model, datasets, metrics
from sklearn.cross_validation import train_test_split
from sklearn import linear_model, datasets, metrics
from sklearn.neural_network import BernoulliRBM
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV


logistic_classifier = linear_model.LogisticRegression()
X_train, X_test, Y_train, Y_test = train_test_split(X, Y,
                                                    test_size=0.15,
                                                    random_state=0)

param_grid = {'penalty':['l1','l2'], 'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000] }
GridSearch = GridSearchCV(logistic_classifier, param_grid, cv = 10)
GridSearch.fit(X_train, Y_train)
bestLRclf = GridSearch.best_estimator_

print("Logistic regression using raw pixel features:\n%s\n" % (
    metrics.classification_report(
        Y_test,
        bestLRclf.predict(X_test))))
#logistic_classifier.fit(X_train, Y_train)


#print logistic_classifier.coef_

Logistic regression using raw pixel features:
             precision    recall  f1-score   support

          0       0.53      0.58      0.56        69
          1       0.55      0.50      0.52        70

avg / total       0.54      0.54      0.54       139




In [12]:
bestLRclf.coef_

array([[ 0.        ,  0.01421821,  0.19353875,  0.09877007,  0.02105472,
        -2.08925479,  0.        ,  0.30814319,  0.18188714,  3.58352564,
         2.0697478 ,  2.04056378,  3.68958638,  2.90386654, -0.20121922,
         1.85021073,  1.01504545,  0.        , -1.00003043, -0.80402304,
        -2.79723816, -0.49570735,  2.12587386,  2.09743664,  0.2712918 ,
         0.89830638, -2.42497643, -3.46702513,  3.37359821, -1.32163752,
         0.71725501,  0.94876401,  2.15431023, -0.31462709,  0.2579936 ,
         0.12825067, -2.59189937, -0.29405608, -2.25435711, -2.22592145,
        -0.64968619, -0.6588328 , -0.5065908 ,  0.        , -0.87584078,
        -0.11568202,  0.32838918,  0.        , -1.49635963,  3.55206639,
         1.93921644,  2.20100486,  0.18823259,  0.        ,  0.        ,
        -0.25979976, -0.19448646, -2.38408463, -1.10918492,  0.32034836,
         1.01410312,  2.06900021, -4.42201593, -0.35496009, -4.28167367,
         0.66012263,  0.98472197, -2.2885398 , -0.8

In [169]:
bestLRclf.predict(X_test[1,:])



array([6])