# Machine learning algorithms

## Social Network analysis algorithms

### Edges

In [None]:
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression , Ridge , Lasso

In [None]:
# Build one link-prediction feature per candidate edge.
# The index of `future_connections` holds the (u, v) node pairs, so we iterate
# over it directly instead of copying the endpoints into temporary columns
# and looking them up again by integer position.
# NOTE(review): assumes every index entry is a 2-tuple of nodes present in G.
edges = list(future_connections.index)

# Number of neighbours shared by both endpoints of each pair
future_connections["common_neighbors"] = [
    len(list(nx.common_neighbors(G, u, v))) for u, v in edges
]

# The remaining networkx predictors all yield (u, v, score) triples in the
# same order as the pairs they are given, so only the score is kept.
# Column order matters: downstream code selects features by position.
link_metrics = [
    ("jaccard_coefficients", nx.jaccard_coefficient),
    ("resource_allocation_index", nx.resource_allocation_index),
    ("adamic_adar_index", nx.adamic_adar_index),
    ("preferential_attachment", nx.preferential_attachment),
]
for column, metric in link_metrics:
    future_connections[column] = [score for _, _, score in metric(G, edges)]

# Rows whose label is missing are the edges we have to score
has_label = future_connections["Future Connection"].notnull()

# Select features by dropping the label by name rather than by column position
to_be_predicted = future_connections[~has_label].drop("Future Connection", axis=1)

# Labelled rows are used for training and validation
complete_data = future_connections[has_label]
data = complete_data.drop("Future Connection", axis=1)

# Create target
target = complete_data["Future Connection"]

# Train test split (no random_state is set, so the split differs on every run)
X_train, X_test, y_train, y_test = train_test_split(data, target)

# Ridge regression model; its continuous output is used as a ranking score
clf = Ridge()
clf.fit(X_train, y_train)

# Score the held-out rows
y_scores = clf.predict(X_test)

# Compute AUC on the held-out rows
score = roc_auc_score(y_test, y_scores)

# Score the unlabelled edges
prediction = clf.predict(to_be_predicted)

# Series of predicted scores indexed by edge.
# These are regression outputs, not calibrated probabilities.
serie = pd.Series(prediction, to_be_predicted.index)
serie

print("Model with 0.91 accuracy")

### Nodes

In [None]:
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression , Ridge , Lasso


# Create a dataframe with one row per graph node.
df = pd.DataFrame(index=list(G.nodes()))

# Add node attributes and per-node graph measures as columns
df['Department'] = pd.Series(nx.get_node_attributes(G, 'Department'))
df['ManagementSalary'] = pd.Series(nx.get_node_attributes(G, 'ManagementSalary'))
df["Clustering"] = pd.Series(nx.clustering(G))
# dict() keeps this working whether nx.degree returns a dict or a
# (node, degree) view; pd.Series needs a mapping to align on node labels.
df["Degree"] = pd.Series(dict(nx.degree(G)))
df["closeness_centrality"] = pd.Series(nx.closeness_centrality(G))
df["betweenness_centrality"] = pd.Series(nx.betweenness_centrality(G))
df["degree_centrality"] = pd.Series(nx.degree_centrality(G))
df["page_rank"] = pd.Series(nx.pagerank(G))

# Sort by the target column; rows with a missing target end up last
df = df.sort_values("ManagementSalary")

# Separate the feature columns from the target column
has_label = df["ManagementSalary"].notnull()
features = df.loc[:, df.columns != 'ManagementSalary']

# Split on whether the target is known instead of on a hard-coded row count
to_be_predicted = features[~has_label]
data = features[has_label]
target = df.loc[has_label, "ManagementSalary"]

# Train test split (no random_state is set, so the split differs on every run)
X_train, X_test, y_train, y_test = train_test_split(data, target)

# Fit a Lasso model on the training part
clf = Lasso()
clf.fit(X_train, y_train)

# Score the held-out rows, whose true labels are known, to get an AUC
y_scores = clf.predict(X_test)
score = roc_auc_score(y_test, y_scores)

# Score the nodes whose target is unknown (regression output, not a probability)
prediction = clf.predict(to_be_predicted)
serie = pd.Series(prediction, to_be_predicted.index)
serie

print("Model with <0.80 auc score accuracy")

## GridSearch and LabelEncoder

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Read comma-separated files into DataFrames
data = pd.read_csv("readonly/train.csv", encoding="ISO-8859-1")
test_set = pd.read_csv("readonly/test.csv", encoding="ISO-8859-1")
adresses_set = pd.read_csv("readonly/addresses.csv", encoding="ISO-8859-1")
latlons_set = pd.read_csv("readonly/latlons.csv", encoding="ISO-8859-1")

# Keep only rows whose "compliance" label is present
data = data[np.isfinite(data["compliance"])]

# Join addresses with their coordinates on the shared "address" column
merged_sets = pd.merge(adresses_set, latlons_set, on="address")

# Attach the coordinates to the training and test tickets via "ticket_id"
merged1 = pd.merge(data, merged_sets, on="ticket_id")
test_merged1 = pd.merge(test_set, merged_sets, on="ticket_id")

# Set "ticket_id" as index for both DataFrames
merged1.set_index("ticket_id", inplace=True)
test_merged1.set_index("ticket_id", inplace=True)

# Use only tickets from the United States. The explicit copy makes merged2 an
# independent frame, so the edits below do not act on a view of merged1.
merged2 = merged1[merged1.country == "USA"].copy()

# Columns removed from both sets
drop2 = ["agency_name","violator_name","violation_street_number","violation_street_name","violation_zip_code",
         "mailing_address_str_number","mailing_address_str_name","city","state","zip_code","non_us_str_code",
         "country","ticket_issued_date","hearing_date","violation_description","inspector_name","address",
         "grafitti_status","violation_code"]
# The training set additionally drops these columns, which drop2 does not
# remove from the test set
drop = ["payment_amount","payment_date","payment_status","balance_due","collection_status",
        "compliance_detail"] + drop2

merged2 = merged2.drop(drop, axis=1)
test_merged1 = test_merged1.drop(drop2, axis=1)

# Encode "disposition" as integers. One encoder is fitted on the values of
# BOTH sets so that a given category gets the same code in train and test;
# two separately fitted encoders can assign different codes to the same label.
label_encoder = LabelEncoder()
label_encoder.fit(pd.concat([merged2['disposition'], test_merged1['disposition']]))
merged2['disposition'] = label_encoder.transform(merged2['disposition'])
test_merged1['disposition'] = label_encoder.transform(test_merged1['disposition'])

# Set target
data_target = merged2.loc[:, "compliance"]

# Set data
merged2 = merged2.drop("compliance", axis=1)
data = merged2

# Train test split; copies let us fill values without chained-assignment warnings
X_train, X_test, y_train, y_test = train_test_split(data, data_target)
X_train = X_train.copy()
X_test = X_test.copy()

# Fill missing coordinates with the TRAINING mean everywhere, so the
# validation and prediction sets are imputed exactly like the data the
# model is fitted on.
for coord in ("lat", "lon"):
    train_mean = X_train[coord].mean()
    X_train[coord] = X_train[coord].fillna(train_mean)
    X_test[coord] = X_test[coord].fillna(train_mean)
    test_merged1[coord] = test_merged1[coord].fillna(train_mean)

# RandomForest classifier
clf = RandomForestClassifier()

# Use a grid search, scored by ROC AUC, to select the best parameters
grid_values = {'n_estimators': [100, 110], 'max_depth': [5, 10]}
grid_clf_auc = GridSearchCV(clf, param_grid=grid_values, scoring='roc_auc')
grid_clf_auc.fit(X_train, y_train)

# Measure AUC on the held-out split, which is otherwise prepared but never used
holdout_auc = roc_auc_score(y_test, grid_clf_auc.predict_proba(X_test)[:, 1])
print("Hold-out AUC: {:.3f}".format(holdout_auc))

# Probability of the positive class for every test ticket, indexed by ticket_id.
# It is stored in a variable because a bare expression in the middle of a
# cell is neither displayed nor kept.
compliance_proba = pd.Series(grid_clf_auc.predict_proba(test_merged1)[:, 1], index=test_merged1.index)

print("Model with 0.91 auc score accuracy")