In [3]:
import pandas as pd
import numpy as np
import timeit
import re
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from mpl_toolkits.mplot3d import Axes3D
from natsort import natsorted
import itertools
from collections import Counter, OrderedDict
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cosine, sqeuclidean
from sklearn import decomposition, preprocessing, cluster, manifold
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.model_selection import cross_val_score, ShuffleSplit, RandomizedSearchCV,\
GridSearchCV
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
from scipy.spatial.distance import squareform, pdist
from sklearn.utils import shuffle
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

from sklearn.metrics.pairwise import pairwise_kernels
from IPython.display import display, HTML


Pandas version is  0.20.2


In [116]:
# Turn off pandas' chained-assignment (SettingWithCopy) check for the whole
# notebook session.
pd.set_option('mode.chained_assignment', None)

In [118]:
# LOADING DATA
# Every table is read from the local `data/` directory. The field separator
# differs from file to file, so each call states it explicitly.

question_table = pd.read_csv('data/app_question', sep='|')
questionchoice_table = pd.read_csv('data/app_questionchoice', sep=',')
questioncollection_table = pd.read_csv('data/app_questioncollection', sep='|')
questioncollectionitem_table = pd.read_csv('data/app_questioncollectionitem', sep='|')
questiontag_table = pd.read_csv('data/app_questiontag', sep='|')
questiontagvote_table = pd.read_csv('data/app_questiontagvote', sep='|')

# Votes are read as strings (dtype='object'); only the two timestamp columns
# are parsed here.
questionvote_table = pd.read_csv(
    'data/app_questionvote.csv',
    sep='|',
    dtype='object',
    parse_dates=['modified_at', 'created_at'],
)

tag_table = pd.read_csv('data/app_tag', sep=';', dtype='object')
usercompare_table = pd.read_csv('data/app_usercompare', sep='|', dtype='object')
userextra_table = pd.read_csv('data/app_userextra', sep='|')
usergeo_table = pd.read_csv('data/app_usergeo', sep='|')
geo_table = pd.read_csv('data/app_geo', sep='|')

all_users_compared_table = pd.read_csv('data/all_users_compared.csv', sep=',', dtype='object')
users_info_table = pd.read_csv('data/users_info.csv', sep=',')

# CLEANING DATA
# NOTE: this cell is not idempotent -- the "100 - x" conversions below flip
# their columns back if the cell is run twice without reloading the data.

# Delete personal data.
userextra_table = userextra_table.drop(
    ['password', 'first_name', 'last_name', 'photo', 'email'], axis=1)
users_info_table = users_info_table.drop(['count_tags'], axis=1)

# Keep active votes only. The explicit copy means the column assignments below
# modify an independent frame instead of a slice of the raw table.
questionvote_table = questionvote_table[questionvote_table['active'] == 't'].copy()

# Convert identifier columns to integers (the vote and tag tables were read
# with dtype='object').
questionvote_table['object_id'] = questionvote_table['object_id'].astype(int)
questionvote_table['user_id'] = questionvote_table['user_id'].astype(int)
tag_table['id'] = tag_table['id'].astype(int)
question_table['id'] = question_table['id'].astype(int)

# Delete duplicate votes: one row per (user, question); the first one is kept.
questionvote_table = questionvote_table.drop_duplicates(subset=['user_id', 'object_id'])

# Some geo data is faulty in some places, here's a hardcoded fix.
geo_fix = {'Medway':'England', 'Moray':'Scotland', 'Argyll and Bute':'Scotland', 'Ards':'N. Ireland'}
users_info_table['region'] = users_info_table['region'].replace(geo_fix)

# Make gender data more readable. Assigning the result back (rather than
# calling replace(..., inplace=True) on the attribute) guarantees the frame
# itself is updated.
genders = {1.0:'male', 2.0:'female', 3.0:'other'}
users_info_table['gender'] = users_info_table['gender'].replace(genders)
#users_info_table.gender.fillna(value='none',inplace=True)

# Replace 0 with NaN so that numpy ignores it in calculations.
# (np.nan instead of np.NaN: the capitalised alias no longer exists in NumPy 2.)
users_info_table = users_info_table.replace(0, np.nan)

# Convert difference to similarity.
party_variables = ['diff_perc_with_labour','diff_perc_with_conservative','diff_perc_with_green',
                   'diff_perc_with_plaidcymru', 'diff_perc_with_ukip', 'diff_perc_with_snp',
                   'diff_perc_with_liberaldemocrat']
for party_variable in party_variables:
    users_info_table[party_variable] = 100 - users_info_table[party_variable]

# Merge tags with questions: one comma-separated, lower-cased tag string per
# question, left-joined onto the question table so untagged questions keep a
# NaN in `tags`.
questions_with_tags = pd.merge(questiontag_table, tag_table[['id','text']],
                               left_on='tag_id', right_on='id')
questions_with_tags = (
    questions_with_tags
    .groupby('question_id')['text']
    .apply(lambda x: ', '.join(x.str.lower()))
    .reset_index()
    .rename(columns={'text': 'tags'})
)
questions_with_tags = pd.merge(question_table, questions_with_tags,
                               left_on='id', right_on='question_id', how='left')
#questions_with_tags.drop('question_id',axis=1,inplace=True)

# Pairwise user comparisons: cast the string columns, then convert the
# difference percentage into a similarity percentage.
all_users_compared_table['difference_percent'] = \
    100 - all_users_compared_table['difference_percent'].astype(float)
all_users_compared_table[['usera_id','userb_id']] = \
    all_users_compared_table[['usera_id','userb_id']].astype(int)

# Party name -> the user id that is matched against `userb_id` in
# all_users_compared_table (presumably each party's own account -- confirm).
parties = {'labour': 17351 , 'conservative': 17663, 'green': 17687 ,'plaidcymru': 17689, 'ukip': 17710,
           'snp': 17711, 'liberaldemocrat': 17692}

# Fill the party-similarity values that are still missing in users_info_table
# from the pairwise comparison table.
for party, party_user_id in parties.items():
    column_name = 'diff_perc_with_' + party
    print(column_name)

    # Similarity to this party, keyed by the compared user's id. Duplicate
    # comparisons for the same user are collapsed to the first row so the
    # lookup index is unique.
    party_rows = all_users_compared_table[all_users_compared_table.userb_id == party_user_id]
    similarity_by_user = (party_rows
                          .drop_duplicates(subset='usera_id')
                          .set_index('usera_id')['difference_percent'])

    # Look the value up by user id. The mapped Series keeps users_info_table's
    # own index, so every value lands on the row of the user it belongs to.
    # (Assigning the result of an inner merge here would align on the merge's
    # fresh 0..k-1 index and attach values to the wrong users whenever a user
    # has no comparison row.)
    missing = users_info_table[column_name].isnull()
    users_info_table.loc[missing, column_name] = \
        users_info_table.loc[missing, 'id'].map(similarity_by_user)



diff_perc_with_conservative
diff_perc_with_labour
diff_perc_with_snp
diff_perc_with_liberaldemocrat
diff_perc_with_plaidcymru
diff_perc_with_green
diff_perc_with_ukip


In [5]:
# Ids of all users whose `country` field is 'United Kingdom'.
is_british = users_info_table['country'] == 'United Kingdom'
british_users = users_info_table.loc[is_british, 'id'].tolist()

In [6]:
def create_vote_data(question_ids,user_ids,rescale_data=False,agg_function=float,question_threshold=500,\
                     user_threshold=20,fillna_value=0,sort_index=True,cell_type=float):
    """Build the user x question vote matrix from the global `questionvote_table`.

    Parameters
    ----------
    question_ids : accepted for interface compatibility; currently not used.
    user_ids : collection of user ids to keep, or None to keep every user.
    rescale_data : if True, map vote values 1..5 onto -2..2 after casting.
    agg_function : callable applied to the votes found for one (user, question) cell.
    question_threshold : a question column is kept only if it has at least this
        many non-missing votes.
    user_threshold : a user row is kept only if it has at least this many
        non-missing votes (counted after the question filter).
    fillna_value : value written into the remaining missing cells.
    sort_index : if truthy, sort the rows by user id.
    cell_type : dtype the final matrix is cast to.

    Returns
    -------
    pandas.DataFrame indexed by `user_id` with one column per question id.
    The resulting number of users and questions is printed.
    """
    # `is not None` (instead of `!= None`) also works when user_ids is a numpy
    # array or a pandas object, where `!=` would compare element-wise.
    if user_ids is not None:
        filtered = questionvote_table[questionvote_table['user_id'].isin(user_ids)]
    else:
        filtered = questionvote_table

    data = filtered.pivot_table(index='user_id',columns='object_id',values='value',
                                aggfunc=lambda x: agg_function(x))

    # `thresh` is the minimum number of non-missing values required to keep a
    # column / row. Questions are filtered first, users second.
    data = data.dropna(thresh=question_threshold, axis=1)
    data = data.dropna(thresh=user_threshold, axis=0)
    data = data.fillna(value=fillna_value)

    data.index.names = ['user_id']

    if sort_index:
        data = data.sort_index()

    data = data.astype(cell_type)
    if rescale_data:
        # centre the 1..5 scale on zero
        replace_dict = {1.0:-2.0, 2.0:-1.0, 3.0:0.0, 4.0:1.0, 5.0:2.0}
        data = data.replace(replace_dict)
    print(data.shape[0],'users')
    print(data.shape[1],'questions')
    return data



In [7]:
#SCALING DATA
def scale_data(data, scale_type = preprocessing.StandardScaler()):
    """Return `data` transformed by the scaler `scale_type`, keeping its labels.

    `scale_type` is any object with a `fit_transform` method; passing None
    returns `data` untouched. The default scaler instance is created once, when
    the function is defined, and reused by every call that relies on the default.
    """
    if scale_type is None:
        return data

    print('the scaling function is ', scale_type)
    scaled_values = scale_type.fit_transform(data)
    # fit_transform's result is re-wrapped with the original row and column labels
    return pd.DataFrame(scaled_values, index=data.index, columns=data.columns)

    


In [119]:

def show_example_plot(labels, centers, colmap):
    """Example 3D scatter plot of the first three components (3 components recommended).

    The points are read from the global `questions_components` frame (columns
    'Component 1' .. 'Component 3'). Each point is coloured with
    `colmap[label + 1]` for its entry in `labels`. `centers` is accepted but
    not drawn.
    """
    fig = plt.figure(figsize=(5, 5))
    ax = Axes3D(fig)

    point_colors = [colmap[label + 1] for label in labels]
    ax.scatter(
        questions_components['Component 1'],
        questions_components['Component 2'],
        questions_components['Component 3'],
        color=point_colors,
        alpha=0.5,
        edgecolor='k',
    )

    plt.xlim(-0.1, 0.1)
    plt.ylim(-0.1, 0.1)
    plt.show()



In [120]:
# PCA
def PCA_items_components(data, n_components,use_projection=False,transpose=False):
    """Fit a PCA on `data` and return either the loadings or the projection.

    The summed explained-variance ratio of the fitted components is printed.

    use_projection == False: returns the component loadings as a DataFrame with
        rows 'Component 1'..'Component n' and the columns of `data`; the frame
        is transposed when `transpose == True`.
    otherwise: returns the projected rows of `data` as a DataFrame with default
        integer row and column labels (`transpose` is ignored).
    """
    model = decomposition.PCA(n_components = n_components)
    projected = model.fit_transform(data)
    print('Explained variance is ',sum(model.explained_variance_ratio_))

    if use_projection != False:
        return pd.DataFrame(projected)

    component_labels = ['Component {}'.format(k) for k in range(1, n_components+1)]
    loadings = pd.DataFrame(model.components_, index=component_labels, columns=data.columns)
    if transpose == True:
        return loadings.T
    return loadings



In [121]:
# FA

def FA_total_var(components,noise_variance):
    """Print the percentage of variance attributed to the factor loadings.

    For every row of `components` the squared loadings are summed; each row's
    share is taken relative to (sum of all squared loadings + sum of
    `noise_variance`) and the shares are added up and printed.
    """
    squared_loadings = np.sum(components**2,axis=1)
    total_variance = np.sum(squared_loadings)+np.sum(noise_variance)

    explained = 0
    for row in range(components.shape[0]):
        explained = explained + (100*squared_loadings[row])/total_variance

    print('Explained variance',explained)
    
    
def FA_items_components(data, n_components,use_projection=False,transpose=False):
    """Fit a factor analysis on `data` and return either the loadings or the projection.

    The explained-variance summary is printed through `FA_total_var`.

    use_projection == False: returns the factor loadings as a DataFrame with
        rows 'Component 1'..'Component n' and the columns of `data`; the frame
        is transposed when `transpose == True`.
    otherwise: returns the transformed rows of `data` as a DataFrame with
        default integer row and column labels (`transpose` is ignored).
    """
    model = decomposition.FactorAnalysis(n_components = n_components)
    projected = model.fit_transform(data)
    FA_total_var(model.components_, model.noise_variance_)

    if use_projection != False:
        return pd.DataFrame(projected)

    component_labels = ['Component {}'.format(k) for k in range(1, n_components+1)]
    loadings = pd.DataFrame(model.components_, index=component_labels, columns=data.columns)
    if transpose == True:
        return loadings.T
    return loadings


In [122]:
#MDS
def MDS_items_components(data, n_components, metric, use_projection=False):
    """Embed the rows of `data` with multidimensional scaling.

    Parameters
    ----------
    data : array-like handed to `manifold.MDS.fit_transform`.
    n_components : number of embedding dimensions.
    metric : forwarded to `manifold.MDS(metric=...)`.
    use_projection : only the projection is available for MDS. With
        `use_projection == False` the function prints 'NOT SUPPORTED' and
        returns None.

    Returns
    -------
    pandas.DataFrame with the embedded rows (default integer labels), or None
    in the unsupported mode. The final stress value is printed after fitting.
    """
    # Bail out before fitting: the embedding is expensive and its result would
    # be discarded in the unsupported mode.
    if use_projection==False:
        print('NOT SUPPORTED')
        return None

    mds = manifold.MDS(n_components = n_components,metric = metric, max_iter=100, verbose=1, n_jobs=1)
    projection = mds.fit_transform(data)
    items_components = pd.DataFrame(projection)
    print('stress value',mds.stress_)
    return items_components


In [123]:
def get_cluster_items_dict(data):
    """Invert a label row into {cluster: [item ids as strings]}.

    `data` is a DataFrame whose columns are the items and which has a row named
    'label' holding each item's cluster. Clusters appear in the result in order
    of first occurrence; item ids are de-duplicated and converted with `str`.
    """
    # Transpose once and visit every distinct label once, instead of rebuilding
    # the same list for every item that carries the label.
    item_labels = data.T['label']

    clusters_dict = {}
    for cluster_label in item_labels.drop_duplicates():
        members = item_labels[item_labels == cluster_label].index.unique().tolist()
        clusters_dict[cluster_label] = [str(item) for item in members]

    return clusters_dict


In [124]:
def get_item_cluster_dict(data):
    """Return {item: cluster} for every column of `data`.

    `data` holds one column per item and a row named 'label' with the item's
    cluster.
    """
    return {item: data[item]['label'] for item in data.columns}


In [126]:
def question_analysis_cluster(questions, relevant_columns, threshold=10, verbose=False):
    """Summarise one cluster of questions.

    Looks the question ids up in the global `questions_with_tags` frame and
    returns an OrderedDict holding the mean of `liquid_vote_count`,
    `polarisation`, `count_comments` and `liquid_consensus`, followed by
    'tag 1' .. 'tag <threshold>' with the most frequent tag tokens (0 where
    fewer tokens exist). `verbose` is accepted but not used.
    """
    in_cluster = questions_with_tags.id.isin(questions)
    questions_df = questions_with_tags[in_cluster][relevant_columns+['tags']]

    # All tag strings of the tagged questions are glued together with spaces,
    # split on ', ' and then split again on single spaces -- so what is counted
    # below are whitespace-separated tokens.
    tagged_questions = questions_df[questions_df.tags.notnull()]
    tag_text = tagged_questions.tags.str.cat(sep=' ')
    tokens = ' '.join(tag_text.split(', ')).split(' ')

    token_counts = Counter(tokens)
    most_common_tags = [token for token, _ in token_counts.most_common(threshold)]

    results = OrderedDict()
    results['avg_liquid_vote_count'] = questions_df.liquid_vote_count.mean()
    results['avg_polarisation'] = questions_df.polarisation.mean()
    results['avg_count_comments'] = questions_df.count_comments.mean()
    results['avg_liquid_consensus'] = questions_df.liquid_consensus.mean()

    tags = {}
    for position in range(1, threshold+1):
        if position <= len(most_common_tags):
            tags['tag '+str(position)] = most_common_tags[position-1]
        else:
            tags['tag '+str(position)] = 0

    # natural sort keeps 'tag 10' after 'tag 9'
    results.update(OrderedDict(natsorted(tags.items())))

    return results
    


In [12]:
# Question columns summarised per cluster.
analysis_question_variables = [
    'id',
    'liquid_vote_count',
    'polarisation',
    'count_comments',
    'liquid_consensus',
]

# User columns summarised per cluster.
analysis_user_variables = [
    # profile
    'age',
    'gender',
    'region',
    # overall activity counters
    'count_comment_votes',
    'count_friends',
    'count_group_memberships',
    'count_question_votes',
    'count_questions',
    'count_votes',
    'count_choice_votes',
    'count_following_tags',
    'count_comments',
    'count_followers',
    'count_following_users',
    'verification_score',
    'verification_count',
    # per-party columns
    'diff_perc_with_labour',
    'diff_perc_with_conservative',
    'diff_perc_with_green',
    'diff_perc_with_plaidcymru',
    'diff_perc_with_ukip',
    'diff_perc_with_snp',
    'diff_perc_with_liberaldemocrat',
    'session_count',
    # last three months
    'count_votes_last_3_months',
    'count_choice_votes_last_3_months',
    'count_compares_last_3_months',
    'count_comments_last_3_months',
    'count_group_membership_last_3_months',
    # last month
    'count_votes_last_month',
    'count_choice_votes_last_month',
    'count_compares_last_month',
    'count_comments_last_month',
    'count_group_membership_last_month',
]



In [13]:
def analyse_clusters(questions, users, analysis_user_variables, analysis_question_variables,
                    cluster_question_dict, display_df=True):
    """Build one summary row per question cluster.

    Parameters
    ----------
    questions : accepted for interface compatibility; not used here.
    users : forwarded to `user_analysis_cluster`.
    analysis_user_variables : user columns forwarded to `user_analysis_cluster`.
    analysis_question_variables : question columns forwarded to
        `question_analysis_cluster`.
    cluster_question_dict : mapping {cluster: [question ids]}.
    display_df : when True (or the legacy string 'True'), render the first ten
        rows of the result as an HTML table.

    Returns
    -------
    pandas.DataFrame indexed by cluster. The sum of its 'num questions' column
    is printed (that column is expected to come from `user_analysis_cluster`,
    which is defined outside this cell -- confirm).
    """
    results = []
    index = []
    for some_cluster, cluster_questions in cluster_question_dict.items():
        index.append(some_cluster)
        # user-side summary first, then the question-side summary merged into it
        cluster_summary = user_analysis_cluster(cluster_questions, users, analysis_user_variables)
        cluster_summary.update(question_analysis_cluster(cluster_questions, analysis_question_variables))
        results.append(cluster_summary)

    cluster_results = pd.DataFrame(results,index=index)

    # The flag used to be compared with the string 'True' only, so the boolean
    # default never displayed anything; both spellings are accepted now.
    if display_df is True or display_df == 'True':
        display(HTML(cluster_results.iloc[:10,:].to_html()))

    print(cluster_results['num questions'].sum())

    return cluster_results

In [127]:

def do_clustering(data, metric,dim_reduction_algorithm, dim_reduction_params, clustering_params,
                  clustering_algorithm):
    """Optionally reduce the dimensionality of `data`, then cluster its items.

    Parameters
    ----------
    data : pandas.DataFrame. It is transposed first when
        `dim_reduction_params['use_projection'] == True`.
    metric : metric for `pairwise_distances` (non-hierarchical branch only; the
        hierarchical branch takes its metric from `clustering_params`).
    dim_reduction_algorithm : 'PCA'/'pca', 'FA'/'fa'/'Factor Analysis',
        'MDS'/'mds'; any other value skips the reduction.
    dim_reduction_params : dict with 'use_projection' and, when a reduction is
        applied, 'n_components' (plus 'metric' for MDS).
    clustering_params : for `linkage`, a dict with 'metric', 'method', 't' and
        'criterion'; otherwise the keyword arguments of the estimator.
    clustering_algorithm : `scipy.cluster.hierarchy.linkage`, or one of
        `cluster.KMeans`, `cluster.DBSCAN`, `cluster.AgglomerativeClustering`,
        `cluster.SpectralClustering`.

    Returns
    -------
    (distance, cluster_items_dict, item_cluster_dict, items_components,
    alg_with_params)
        distance : square distance matrix.
        cluster_items_dict : {cluster: [items]}.
        item_cluster_dict : {item: cluster}.
        items_components : the reduced representation, or None without reduction.
        alg_with_params : the linkage matrix, or the fitted estimator.

    Raises
    ------
    ValueError : if `clustering_algorithm` is not one of the supported ones.
    """
    use_projection = dim_reduction_params['use_projection']==True
    if use_projection:
        data = data.T

    # PCA and FA share one calling convention, so they share one code path.
    if dim_reduction_algorithm in ['PCA','pca']:
        reducer = PCA_items_components
    elif dim_reduction_algorithm in ['FA','fa','Factor Analysis']:
        reducer = FA_items_components
    else:
        reducer = None

    if reducer is not None:
        if use_projection:
            items_components = reducer(data,n_components = dim_reduction_params['n_components'],
                                       use_projection=True)
            use_index = data.index
        else:
            items_components = reducer(data,n_components = dim_reduction_params['n_components'],
                                       use_projection=False, transpose=True)
            use_index = data.columns
    elif dim_reduction_algorithm in ['MDS','mds']:
        items_components = MDS_items_components(data, n_components = dim_reduction_params['n_components'],
                                                metric = dim_reduction_params['metric'],use_projection=True)
        use_index = data.index
    else:
        # No reduction: the items are the rows of `data`. (`use_index` used to
        # be left undefined here, which broke the non-hierarchical branch.)
        items_components = None
        use_index = data.index

    #do clustering
    if clustering_algorithm==linkage:
        if items_components is not None:
            # label the reduced representation, then cluster it instead of `data`
            data_new = items_components
            data_new.index = use_index
            data_new.columns = ['Component '+str(i) for i in range(1,dim_reduction_params['n_components']+1)]
            data = data_new
        Z = pdist(data,clustering_params['metric'])
        alg_with_params = linkage(Z, clustering_params['method'])
        distance = squareform(Z)
        hierarchy_clusters = fcluster(alg_with_params, clustering_params['t'], criterion=clustering_params['criterion'],depth=25)

        item_cluster_dict={item : hierarchy_clusters[i] for i, item in enumerate(data.index)}
        # single-pass inversion; clusters and items keep their order of appearance
        cluster_items_dict = {}
        for item, item_cluster in item_cluster_dict.items():
            cluster_items_dict.setdefault(item_cluster, []).append(item)

    else:

        alg_with_params = clustering_algorithm(**clustering_params)
        # NOTE(review): the distances are taken between the rows of `data`,
        # not of the reduced representation. With use_projection False the
        # labels from the distance-based branches may therefore not line up
        # with `use_index` (data.columns) -- confirm the intended behaviour.
        distance = pairwise_distances(data, metric=metric)

        if clustering_algorithm==cluster.KMeans:
            # fall back to the raw data when no reduction was applied
            features = items_components if items_components is not None else data
            labels = alg_with_params.fit_predict(features)

        elif clustering_algorithm==cluster.DBSCAN:
            # only DBSCAN consumes the min-max scaled distances
            distance_scaled = preprocessing.MinMaxScaler().fit_transform(distance)
            similarity = 1 - distance_scaled
            labels = alg_with_params.fit_predict(similarity)

        elif clustering_algorithm==cluster.AgglomerativeClustering or\
        clustering_algorithm==cluster.SpectralClustering:
            labels = alg_with_params.fit_predict(distance)

        else:
            raise ValueError('unsupported clustering algorithm: {!r}'.format(clustering_algorithm))

        #columns are items, label is the one and only row
        item_clusters = pd.DataFrame(labels,columns = ['label'], \
                                            index = use_index).T

        # cluster: ['list of item ids']
        cluster_items_dict = get_cluster_items_dict(item_clusters)

        # item: cluster
        item_cluster_dict = get_item_cluster_dict(item_clusters)

    return distance, cluster_items_dict, item_cluster_dict, items_components, alg_with_params






In [15]:
# Question columns used for the supervised-learning data set.
relevant_question_variables = [
    'id',
    'liquid_vote_count',
    'polarisation',
    'count_comments',
    'liquid_consensus',
]

# User columns used for the supervised-learning data set.
relevant_user_variables = [
    # profile
    'age',
    'gender',
    'region',
    # overall activity counters
    'count_comment_votes',
    'count_friends',
    'count_group_memberships',
    'count_question_votes',
    'count_questions',
    'count_votes',
    'count_choice_votes',
    'count_following_tags',
    'count_comments_user',
    'count_followers',
    'count_following_users',
    'verification_score',
    'verification_count',
    # per-party columns
    'diff_perc_with_labour',
    'diff_perc_with_conservative',
    'diff_perc_with_green',
    'diff_perc_with_plaidcymru',
    'diff_perc_with_ukip',
    'diff_perc_with_snp',
    'diff_perc_with_liberaldemocrat',
    'session_count',
    # last three months
    'count_votes_last_3_months',
    'count_choice_votes_last_3_months',
    'count_compares_last_3_months',
    'count_comments_last_3_months',
    'count_group_membership_last_3_months',
    # last month
    'count_votes_last_month',
    'count_choice_votes_last_month',
    'count_compares_last_month',
    'count_comments_last_month',
    'count_group_membership_last_month',
]

# Timestamp columns of a vote.
relevant_time_variables = ['modified_at', 'created_at']

# Columns that are one-hot encoded rather than used as numbers.
categorical_variables = ['region', 'gender', 'question_cluster', 'user_cluster']

In [84]:
def get_sl_data(questions, users, question_clusters, user_clusters, user_variables,
                time_variables, question_variables, categorical_variables, ignore_unanswered=True, tag_threshold=40):
    """Assemble the feature matrix X and target Y for supervised learning.

    One row per vote: the vote (from the global `questionvote_table`) joined
    with its user's row from `users_info_table` and its question's row from
    `questions_with_tags`. Columns present on both sides of the question join
    receive the suffixes '_user' / '_question'.

    Parameters
    ----------
    questions, users : ids of the questions and users whose votes are kept.
    question_clusters, user_clusters : dicts {id: cluster}; each one adds a
        'question_cluster' / 'user_cluster' column. Non-dict values are ignored.
    user_variables : user columns used as features.
    time_variables : timestamp columns; converted to integers and min-max scaled.
    question_variables : question columns taken from `questions_with_tags` for
        the join. The question features that end up in X are the fixed list
        `new_question_variables` below.
    categorical_variables : columns that are one-hot encoded (with a NaN
        indicator) instead of being used as numbers. Names that are not in the
        joined frame are skipped.
    ignore_unanswered : drop rows whose 'value' is missing.
    tag_threshold : only used by the tag feature engineering, which is
        currently commented out.

    Returns
    -------
    (X, Y) : the feature DataFrame (missing values filled with 0) and the
    'value' column of the shuffled rows. The shuffle is not seeded.
    """
    vote_mask = questionvote_table['object_id'].isin(questions) \
                & questionvote_table['user_id'].isin(users)
    questionvote_filtered = questionvote_table[vote_mask]

    sl_data = pd.merge(questionvote_filtered, users_info_table, left_on='user_id',right_on='id')

    questions_with_tags_filtered = questions_with_tags[questions_with_tags.id.isin(questions)]
    questions_with_tags_filtered = questions_with_tags_filtered[question_variables+['tags']]

    sl_data = pd.merge(sl_data, questions_with_tags_filtered,left_on='object_id',right_on='id',\
                          suffixes=('_user','_question'))

    # np.int64 instead of the platform-dependent `int`, which can be 32 bit
    for time_variable in time_variables:
        sl_data[time_variable] = sl_data[time_variable].astype(np.int64)

    sl_data = shuffle(sl_data)

    if ignore_unanswered:
        sl_data = sl_data.dropna(subset=['value'])

    if isinstance(question_clusters, dict):
        sl_data['question_cluster'] = sl_data['object_id'].map(question_clusters)
    if isinstance(user_clusters, dict):
        sl_data['user_cluster'] = sl_data['user_id'].map(user_clusters)
    print('SL DATA COLUMNS',sl_data.columns)

    new_question_variables = ['liquid_vote_count','polarisation','count_comments_question','liquid_consensus']

    # The features come from the arguments of this call; the module-level
    # relevant_* lists used to be read here, silently ignoring the parameters.
    candidate_columns = list(user_variables) + list(time_variables) + new_question_variables + ['object_id']
    feature_columns = [column for column in candidate_columns
                       if column not in categorical_variables] #categorical columns are added as dummies below

    # explicit copy: X is modified below and must not be a view of sl_data
    X = sl_data[feature_columns].copy()

#------------------------------------TAGS FEATURE ENGINEERING - CURRENTLY OMITTED------------------#
#     questions_df = questions_with_tags[questions_with_tags.id.isin(questions)]
#     #list of tags for every question
#     all_tags = re.split(', |\|',questions_df.fillna(value='None').tags.str.cat(sep='|'))

#     #count the tags
#     cnt = Counter(all_tags)
#     most_common_tags = [i[0] for i in list(cnt.most_common(tag_threshold))]

#     q_with_list_tags = questions_df['tags'].fillna(value='None').\
#     apply(lambda x: x.split(', '))

#     mlb = preprocessing.MultiLabelBinarizer()
#     tags = mlb.fit_transform(q_with_list_tags)
#     #print(tags)
#     tags = pd.DataFrame(tags, columns=mlb.classes_,index=questions_df.id)[most_common_tags]

    #X = pd.merge(X,tags,right_index=True,left_on='object_id')

    # Has an effect only if an unsuffixed 'count_comments' column was selected.
    X = X.rename(columns={'count_comments':'count_comments_question'})

    time_columns = list(time_variables)
    if time_columns:
        X[time_columns] = preprocessing.MinMaxScaler().fit_transform(X[time_columns])

    for category in categorical_variables:
        if category not in sl_data.columns:
            # e.g. a cluster column that was not created because no dict was passed
            continue
        X = pd.concat([X, pd.get_dummies(sl_data[category],prefix=category,dummy_na=True)],axis=1)

    X = X.fillna(value=0)
    # object_id is not used as a feature
    X = X.drop(['object_id'],axis=1)
    Y = sl_data['value']
    return X,Y

def gridsearch_cv(X,Y,estimator, cv, params, scoring, verbose, error_score='raise', n_jobs=-1):
    """Run a `GridSearchCV` over `params` for `estimator` on (X, Y).

    All arguments are passed straight through to `GridSearchCV` (`params` as
    `param_grid`). Returns the search object after `fit` has been called on it.
    """
    search_options = dict(
        estimator=estimator,
        param_grid=params,
        cv=cv,
        scoring=scoring,
        verbose=verbose,
        error_score=error_score,
        n_jobs=n_jobs,
    )
    search = GridSearchCV(**search_options)
    search.fit(X,Y)
    return search


In [44]:
# Clustering parameter grids. Suffix `_u` marks the grid used for clustering
# users, `_q` the one used for clustering questions.
kmeans_params_u_grid = {
    'init': ['k-means++'],
    'n_clusters': [20, 40, 50],
    'n_init': [10],
}
kmeans_params_q_grid = {
    'init': ['k-means++'],
    'n_clusters': [10, 15, 20],
    'n_init': [5],
    # 'n_jobs': [-1],
}
hierarchy_params_q = {
    'metric': ['euclidean', 'correlation', 'cityblock'],
    'method': ['ward'],
    'criterion': ['maxclust'],
    't': [5, 10, 20],
}
hierarchy_params_u = {
    'metric': ['euclidean', 'correlation', 'cityblock'],
    'method': ['ward'],
    'criterion': ['maxclust'],
    't': [5, 20, 40],
}
spectral_poly_u = {
    'n_init': [10],
    'n_clusters': [10, 20],
    'affinity': ['sigmoid'],
    'n_jobs': [-1],
}
spectral_poly_q = {
    'n_init': [5],
    'n_clusters': [5, 10],
    'affinity': ['sigmoid'],
    'n_jobs': [-1],
}

# Dimensionality-reduction grids.
dim_reduction_params_grid = {
    'n_components': [5, 20, 50],
    'use_projection': [True],
}
mds_params_grid = {
    'n_components': [50],
    'metric': [True, False],
    'use_projection': [True],
}

# Vote matrix of the British users, thresholded and then standardised.
data = scale_data(
    create_vote_data(
        None,
        british_users,
        question_threshold=500,
        user_threshold=20,
        fillna_value=0,
        rescale_data=False,
    )
)

users = data.index.tolist()  # users that passed the threshold
questions = data.columns.tolist()  # questions that passed the threshold

# Distance metric handed to do_clustering by my_gridsearch.
metric = 'euclidean'
def my_gridsearch(model,clustering,dim_red,clust_params_u,clust_params_q,start=None, threshold=None):
    """Score `model` for every combination of reduction and clustering settings.

    For each combination, questions (`data.T`) and users (`data`) are clustered
    with `do_clustering` using PCA, the cluster labels are added as features by
    `get_sl_data`, and `model` is evaluated with 3-fold `cross_val_score`.
    Reads the globals `data`, `metric`, `questions`, `users` and the relevant_*
    variable lists.

    Parameters
    ----------
    model : estimator handed to `cross_val_score`.
    clustering : clustering algorithm handed to `do_clustering`.
    dim_red, clust_params_u, clust_params_q : dicts {parameter: [values]} for
        the dimensionality reduction, the user clustering and the question
        clustering.
    start : accepted but not used; the timer always starts when the function
        is entered.
    threshold : if not None, only the first `threshold` combinations are run.

    Returns
    -------
    dict mapping a textual description of each combination to its array of
    cross-validation scores. The elapsed time is printed.
    """
    def expand(grid):
        # every combination of one value per key, as a list of dicts
        return [dict(zip(grid, values)) for values in itertools.product(*grid.values())]

    def describe(params):
        return str(['{} {}'.format(k,v) for k,v in params.items()])

    timer_start = timeit.default_timer()

    dim_red_combinations = expand(dim_red)
    clust_u_combinations = expand(clust_params_u)
    clust_q_combinations = expand(clust_params_q)
    final_comb = list(itertools.product(dim_red_combinations, clust_u_combinations,clust_q_combinations))
    print('the len of final comb is', len(final_comb))
    print('the input sizes are: ',len(dim_red_combinations),'x',len(clust_u_combinations),
          'x',len(clust_q_combinations))

    if threshold is not None:
        final_comb = final_comb[:threshold]

    scores = {}
    for i, (dim_params, user_params, question_params) in enumerate(final_comb):
        print('fitting '+str(i+1)+' out of '+str(len(final_comb)))
        print(user_params)
        s = 'dim_params: '+ describe(dim_params)\
            + ' user_params: ' + describe(user_params)\
            + ' question_params: ' + describe(question_params)

        # questions are the rows of data.T, users the rows of data
        (question_distance, cluster_question_dict, question_cluster_dict,
         questions_components, alg_q) = do_clustering(data.T, metric, 'PCA', dim_params,
                                                      question_params, clustering)
        (user_distance, cluster_user_dict, user_cluster_dict,
         users_components, alg_u) = do_clustering(data, metric, 'PCA', dim_params,
                                                  user_params, clustering)

        X,Y = get_sl_data(questions, users, question_cluster_dict, user_cluster_dict, relevant_user_variables,
                          relevant_time_variables, relevant_question_variables, categorical_variables)
        res = cross_val_score(model,X,Y,cv=3)

        # keep the better result if this description was already scored
        previous = scores.get(s)
        if previous is None or np.mean(res) > np.mean(previous):
            scores[s] = res

    end = timeit.default_timer() - timer_start
    print('took {:.3f}s'.format(end))
    return scores

# Candidate classifiers; only `sgd` is used in the search below.
sgd = SGDClassifier(loss='hinge', max_iter=300, verbose=0, n_jobs=-1)
rf = RandomForestClassifier(
    n_estimators=100,
    max_features=None,
    oob_score=False,
    verbose=0,
    n_jobs=-1,
)

# Grid search over spectral clustering of users and questions.
a = my_gridsearch(
    model=sgd,
    clustering=cluster.SpectralClustering,
    dim_red=dim_reduction_params_grid,
    clust_params_u=spectral_poly_u,
    clust_params_q=spectral_poly_q,
)


4015 users
268 questions
the scaling function is  StandardScaler(copy=True, with_mean=True, with_std=True)
the len of final comb is 12
the input sizes are:  3 x 2 x 2
fitting 1 out of 12
{'n_init': 10, 'n_clusters': 10, 'n_jobs': -1, 'affinity': 'sigmoid'}
Explained variance is  0.657873960269




Explained variance is  0.641746839654
fitting 2 out of 12
{'n_init': 10, 'n_clusters': 10, 'n_jobs': -1, 'affinity': 'sigmoid'}
Explained variance is  0.657873960269




Explained variance is  0.641746839655
fitting 3 out of 12
{'n_init': 10, 'n_clusters': 20, 'n_jobs': -1, 'affinity': 'sigmoid'}
Explained variance is  0.657873960269




Explained variance is  0.641746839652
fitting 4 out of 12
{'n_init': 10, 'n_clusters': 20, 'n_jobs': -1, 'affinity': 'sigmoid'}
Explained variance is  0.657873960269




Explained variance is  0.641746839656
fitting 5 out of 12
{'n_init': 10, 'n_clusters': 10, 'n_jobs': -1, 'affinity': 'sigmoid'}
Explained variance is  0.753920256939




Explained variance is  0.734065751476
fitting 6 out of 12
{'n_init': 10, 'n_clusters': 10, 'n_jobs': -1, 'affinity': 'sigmoid'}
Explained variance is  0.753939087835




Explained variance is  0.734073947913
fitting 7 out of 12
{'n_init': 10, 'n_clusters': 20, 'n_jobs': -1, 'affinity': 'sigmoid'}
Explained variance is  0.753899589696




Explained variance is  0.734038769878
fitting 8 out of 12
{'n_init': 10, 'n_clusters': 20, 'n_jobs': -1, 'affinity': 'sigmoid'}
Explained variance is  0.753866619619




Explained variance is  0.734021307846
fitting 9 out of 12
{'n_init': 10, 'n_clusters': 10, 'n_jobs': -1, 'affinity': 'sigmoid'}
Explained variance is  0.82211668862




Explained variance is  0.807529353757
fitting 10 out of 12
{'n_init': 10, 'n_clusters': 10, 'n_jobs': -1, 'affinity': 'sigmoid'}
Explained variance is  0.82213436936




Explained variance is  0.807766044679
fitting 11 out of 12
{'n_init': 10, 'n_clusters': 20, 'n_jobs': -1, 'affinity': 'sigmoid'}
Explained variance is  0.822440229742




Explained variance is  0.807577966128
fitting 12 out of 12
{'n_init': 10, 'n_clusters': 20, 'n_jobs': -1, 'affinity': 'sigmoid'}
Explained variance is  0.822540644822




Explained variance is  0.807680520042
took 1381.343s


In [46]:
# Rank every configuration by the mean of its two best fold scores and show
# the description of the winner.
mean_res = {
    description: np.mean(sorted(fold_scores)[-2:])
    for description, fold_scores in a.items()
}
max(mean_res, key=mean_res.get)


"dim_params: ['n_components 20', 'use_projection True'] user_params: ['n_init 10', 'n_clusters 10', 'n_jobs -1', 'affinity sigmoid'] question_params: ['n_init 5', 'n_clusters 5', 'n_jobs -1', 'affinity sigmoid']"

In [92]:
# NOTE(review): X and Y are read from the notebook namespace here; no visible
# top-level cell assigns them -- confirm where they come from before re-running.
gbc = GradientBoostingClassifier(
    n_estimators=50,
    max_depth=10,
    min_samples_split=2,
    min_samples_leaf=1,
    verbose=2,
)

# fit on the first two thirds of the rows
n_train = int(2/3*(len(X)))
gbc.fit(X[:n_train], Y[:n_train])

      Iter       Train Loss   Remaining Time 
         1      290171.8982           52.47m
         2      276874.9095           51.67m
         3      266191.0163           50.33m
         4      257346.2013           48.46m
         5      249832.9304           46.69m
         6      243339.4049           45.67m
         7      237819.2908           44.50m
         8      232825.4191           43.28m
         9      228469.8929           41.95m
        10      224679.8726           40.70m
        11      221317.3095           39.44m
        12      218285.1144           38.25m
        13      215493.2852           37.02m
        14      213021.0738           35.81m
        15      210726.2595           34.64m
        16      208615.2553           33.47m
        17      206582.2537           32.29m
        18      204700.7168           31.18m
        19      202906.3473           30.09m
        20      201397.4551           28.90m
        21      199706.1918           27.80m
        2

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=10,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=50,
              presort='auto', random_state=None, subsample=1.0, verbose=2,
              warm_start=False)

In [103]:
# Feature importances of the fitted model in percent, largest first.
feat_imp = pd.DataFrame(
    gbc.feature_importances_*100,
    index=X.columns,
    columns=['importance'],
)
feat_imp = feat_imp.sort_values(by='importance', ascending=False)

# bar chart of the ten most important features
feat_imp.head(10).plot(kind='bar')
plt.show()
