In [61]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold


### Loading data

Loading `discussion_questions.csv` and `discussion_answers.csv` as-is causes almost half of their rows to be dropped. The cause is incorrect formatting in the CSV files: commas inside the discussion contents are interpreted as field delimiters. Apply the following clean-up steps to both files before loading them:

delete all `{\"dtdId\":\"discussion/1\",\"value\":\"<co-content>`

delete all `</co-content>\"}`

replace all `\\\"` with `""`

In [62]:
# Users and the roster file that is later merged in on umich_user_id.
users = pd.read_csv("MTC508/users.csv")
target = pd.read_csv("MTC508/MTC508_roster_outputlabel_Jaeyoon.csv")

# Discussion questions, plus the votes and followings attached to them.
disc_qs = pd.read_csv("MTC508/discussion_questions.csv")
disc_qs_votes = pd.read_csv("MTC508/discussion_question_votes.csv")
disc_qs_followings = pd.read_csv("MTC508/discussion_question_followings.csv")

# Discussion answers and the votes attached to them.
disc_ans = pd.read_csv("MTC508/discussion_answers.csv")
disc_ans_votes = pd.read_csv("MTC508/discussion_answer_votes.csv")

Select the important columns from the user table

In [63]:
# Columns carried from the users table into the per-user feature frame.
# Earlier experiments used only the user ID, or left out educational_attainment.
selected_user_columns = [
    "umich_user_id",
    "country_cd",
    "reported_or_inferred_gender",
    "educational_attainment",
]

# Start the feature table with one row per row of `users`.
df = pd.DataFrame(users).loc[:, selected_user_columns]

Get the total number of discussion answers a user has posted

In [64]:
# Count the discussion answers posted by each user and attach the count to the
# feature table; users without any answers get 0.
answer_counts = disc_ans.groupby('umich_discussions_user_id').size().reset_index(name='answer_count')
df = pd.merge(df, answer_counts, left_on='umich_user_id', right_on='umich_discussions_user_id', how='left')
# The right-hand join key duplicates umich_user_id, so drop it after the merge.
df = df.drop(columns='umich_discussions_user_id')
# Assign the filled column back rather than calling fillna(inplace=True) on the
# column selection: the in-place form is chained assignment, which emits a
# warning in pandas 2.x and leaves `df` unchanged under Copy-on-Write.
df['answer_count'] = df['answer_count'].fillna(0)

Get the average length of a discussion answer by a user

In [65]:
def calculate_average_answer_length(user_id):
    """Return the mean character length of the answers posted by `user_id`.

    Looks the user up in the notebook-level `disc_ans` frame. The total
    length of `discussion_answer_content` is divided by the number of answer
    rows for that user. Returns 0 when the user has posted no answers.
    """
    is_author = disc_ans['umich_discussions_user_id'] == user_id
    answer_texts = disc_ans.loc[is_author, 'discussion_answer_content']
    n_answers = len(answer_texts)
    # Guard against dividing by zero for users who never answered.
    if n_answers == 0:
        return 0
    return answer_texts.str.len().sum() / n_answers

# One lookup per user in the feature table.
df['average_answer_length'] = df['umich_user_id'].apply(calculate_average_answer_length)

Get the total number of votes given to discussion answers by a user

In [66]:
# Sum the vote values each user has cast on discussion answers and attach the
# total to the feature table; users who never voted on an answer get 0.
user_vote_counts = (
    disc_ans_votes.groupby('umich_discussions_user_id')['discussion_answer_vote_value']
    .sum()
    .reset_index(name='total_votes_given_answers')
)
df = pd.merge(df, user_vote_counts, left_on='umich_user_id', right_on='umich_discussions_user_id', how='left')
# The right-hand join key duplicates umich_user_id, so drop it after the merge.
df = df.drop(columns='umich_discussions_user_id')
# Assign back instead of fillna(inplace=True) on a column selection, which is
# chained assignment: it warns in pandas 2.x and is a no-op under Copy-on-Write.
df['total_votes_given_answers'] = df['total_votes_given_answers'].fillna(0)

Get the total number of votes received on discussion answers by a user

In [67]:
# Step 1: total vote value received by each individual answer.
answer_vote_counts_received = (
    disc_ans_votes.groupby('discussion_answer_id')['discussion_answer_vote_value']
    .sum()
    .reset_index(name='total_votes_received_answers')
)

# Step 2: attach those totals to the answers so each total is tied to the
# answer's author, keeping only the author ID and the vote total.
discussion_answers_with_votes = pd.merge(disc_ans, answer_vote_counts_received, on='discussion_answer_id', how='left')
discussion_answers_with_votes = discussion_answers_with_votes[['umich_discussions_user_id', 'total_votes_received_answers']]

# Step 3: join onto the users (one row per user per answer) and sum per user.
votes = pd.merge(df, discussion_answers_with_votes, left_on='umich_user_id', right_on='umich_discussions_user_id', how='left')
votes = votes.drop(columns='umich_discussions_user_id')
# Assign back instead of fillna(inplace=True) on a column selection, which is
# chained assignment: it warns in pandas 2.x and is a no-op under Copy-on-Write.
votes['total_votes_received_answers'] = votes['total_votes_received_answers'].fillna(0)
votes = votes.groupby('umich_user_id')['total_votes_received_answers'].sum()

# Step 4: add the per-user total to the feature table.
df = pd.merge(df, votes, on='umich_user_id', how='left')


Get the total number of discussion questions a user has posted

In [68]:
# Count the discussion questions posted by each user and attach the count to
# the feature table; users without any questions get 0.
questions_counts = disc_qs.groupby('umich_discussions_user_id').size().reset_index(name='question_count')
df = pd.merge(df, questions_counts, left_on='umich_user_id', right_on='umich_discussions_user_id', how='left')
# The right-hand join key duplicates umich_user_id, so drop it after the merge.
df = df.drop(columns='umich_discussions_user_id')
# Assign back instead of fillna(inplace=True) on a column selection, which is
# chained assignment: it warns in pandas 2.x and is a no-op under Copy-on-Write.
df['question_count'] = df['question_count'].fillna(0)

Get the average length of a discussion question by a user

In [69]:
def calculate_average_question_length(user_id):
    """Return the mean character length of the questions posted by `user_id`.

    Looks the user up in the notebook-level `disc_qs` frame. The total length
    of `discussion_question_details` is divided by the number of question
    rows for that user. Returns 0 when the user has posted no questions.
    """
    user_questions = disc_qs[disc_qs['umich_discussions_user_id'] == user_id]
    # Handle the case where there are no questions for the user
    if len(user_questions) == 0:
        return 0
    total_length = user_questions['discussion_question_details'].str.len().sum()
    return total_length / len(user_questions)

# Use the question-length helper here. The previous version applied
# calculate_average_answer_length, so this column was a copy of
# average_answer_length.
df['average_question_length'] = df['umich_user_id'].apply(calculate_average_question_length)

Get the total number of votes given to discussion questions by a user

In [70]:
# Sum the vote values each user has cast on discussion questions and attach
# the total to the feature table; users who never voted on a question get 0.
user_vote_counts = (
    disc_qs_votes.groupby('umich_discussions_user_id')['discussion_question_vote_value']
    .sum()
    .reset_index(name='total_votes_given_questions')
)
df = pd.merge(df, user_vote_counts, left_on='umich_user_id', right_on='umich_discussions_user_id', how='left')
# The right-hand join key duplicates umich_user_id, so drop it after the merge.
df = df.drop(columns='umich_discussions_user_id')
# Assign back instead of fillna(inplace=True) on a column selection, which is
# chained assignment: it warns in pandas 2.x and is a no-op under Copy-on-Write.
df['total_votes_given_questions'] = df['total_votes_given_questions'].fillna(0)

Get the total number of votes received on discussion questions by a user

In [71]:
# Step 1: total vote value received by each individual question.
question_vote_counts_received = (
    disc_qs_votes.groupby('discussion_question_id')['discussion_question_vote_value']
    .sum()
    .reset_index(name='total_votes_received_questions')
)

# Step 2: attach those totals to the QUESTIONS table so each total is tied to
# the question's author. The previous version joined onto disc_ans, which
# credited a question's votes to everyone who answered it.
# NOTE(review): assumes disc_qs has a discussion_question_id column - confirm.
discussion_questions_with_votes = pd.merge(disc_qs, question_vote_counts_received, on='discussion_question_id', how='left')
discussion_questions_with_votes = discussion_questions_with_votes[['umich_discussions_user_id', 'total_votes_received_questions']]

# Step 3: join onto the users (one row per user per question) and sum per user.
votes = pd.merge(df, discussion_questions_with_votes, left_on='umich_user_id', right_on='umich_discussions_user_id', how='left')
votes = votes.drop(columns='umich_discussions_user_id')
# Assign back instead of fillna(inplace=True) on a column selection, which is
# chained assignment: it warns in pandas 2.x and is a no-op under Copy-on-Write.
votes['total_votes_received_questions'] = votes['total_votes_received_questions'].fillna(0)
votes = votes.groupby('umich_user_id')['total_votes_received_questions'].sum()

# Step 4: add the per-user total to the feature table.
df = pd.merge(df, votes, on='umich_user_id', how='left')


Get the number of discussion questions a user is following

In [72]:
# Pair every user in the feature table with the question followings they own.
merged_df = pd.merge(df, disc_qs_followings, left_on='umich_user_id', right_on='umich_discussions_user_id', how='left')

# Group by 'umich_discussions_user_id' and count the following rows per user.
# NOTE(review): count() tallies every non-null value of
# 'discussion_question_following_active' whatever the flag's value; if only
# active followings should count, filter on the flag first - confirm intent.
user_question_counts = merged_df.groupby('umich_discussions_user_id')['discussion_question_following_active'].count().reset_index()

# Rename the columns so the key matches the feature table
user_question_counts.columns = ['umich_user_id', 'total_questions_following']

# Merge user_question_counts back into df
df = pd.merge(df, user_question_counts, on='umich_user_id', how='left')

# Replace NaN values in the 'total_questions_following' column with 0.
# Assign back instead of fillna(inplace=True) on a column selection, which is
# chained assignment: it warns in pandas 2.x and is a no-op under Copy-on-Write.
df['total_questions_following'] = df['total_questions_following'].fillna(0)

In [73]:
# The roster's 'id' column is not used as a feature; remove it in place.
target.drop(columns=['id'], inplace=True)

# One-hot encode the categorical user attributes.
categorical_columns = ['country_cd', 'reported_or_inferred_gender', 'educational_attainment']
df = pd.get_dummies(df, columns=categorical_columns)

In [74]:
# Collapse country into a single US / non-US indicator by dropping every
# country dummy except 'country_cd_US'.
# The list is derived from the current columns instead of being hard-coded, so
# the cell no longer raises KeyError when a previously listed country is absent
# and no longer leaves a stray dummy behind when a new country appears.
intl_countries = [
    column
    for column in df.columns
    if column.startswith('country_cd_') and column != 'country_cd_US'
]
df = df.drop(columns=intl_countries)

In [75]:
# Keep a single gender indicator: rename the male dummy to 'is_male' and
# discard the female dummy.
df = df.rename(columns={'reported_or_inferred_gender_male': 'is_male'})
df = df.drop(columns=['reported_or_inferred_gender_female'])

In [76]:
# Education dummies treated as "bachelor's degree obtained" (bachelor,
# doctorate, masters, professional).
degree_columns = [
    'educational_attainment_BACHELOR_DEGREE',
    'educational_attainment_DOCTORATE_DEGREE',
    'educational_attainment_MASTERS_DEGREE',
    'educational_attainment_PROFESSIONAL_DEGREE',
]
# 1 when any of the degree dummies is set for the user, otherwise 0.
holds_degree = df[degree_columns].any(axis=1)
df['bachelor_obtained'] = holds_degree.astype(int)

In [77]:
# The individual education dummies are replaced by 'bachelor_obtained', so
# remove all of them from the feature table.
education_dummy_columns = [
    'educational_attainment_ASSOCIATE_DEGREE',
    'educational_attainment_BACHELOR_DEGREE',
    'educational_attainment_COLLEGE_NO_DEGREE',
    'educational_attainment_DOCTORATE_DEGREE',
    'educational_attainment_HIGH_SCHOOL_DIPLOMA',
    'educational_attainment_LESS_THAN_HIGH_SCHOOL_DIPLOMA',
    'educational_attainment_MASTERS_DEGREE',
    'educational_attainment_PROFESSIONAL_DEGREE',
]
df = df.drop(columns=education_dummy_columns)

In [78]:
# Store the three indicator features as integers.
indicator_columns = ['country_cd_US', 'is_male', 'bachelor_obtained']
df[indicator_columns] = df[indicator_columns].astype(int)

In [79]:
# Attach the roster columns to each user (left join keeps every user row),
# then drop the roster's own user-ID column.
df = (
    df.merge(target, how='left', on='umich_user_id')
    .drop(columns=['essentials_of_social_welfare_policy_user_id'])
)

In [80]:
# Persist the feature table. index=True is the pandas default and is spelled
# out here: the row index is written as an unnamed first column.
df.to_csv('preprocessed.csv', index=True)

### Logistic Regression

In [81]:
# X = final_df.drop('completed', axis=1)
# y = final_df['completed']

# model = LogisticRegression()

# scores = cross_val_score(model, X, y, cv=10)

# for i, score in enumerate(scores, 1):
#     print(f'Fold {i}: Accuracy = {score:.2f}')


# mean_accuracy = scores.mean()
# std_accuracy = scores.std()
# print(f'Mean Accuracy = {mean_accuracy:.2f}')
# print(f'Standard Deviation = {std_accuracy:.2f}')

In [82]:
# model.fit(X, y)
# coefficients = model.coef_
# coefficients_dict = (np.std(X, 0)*(model.coef_[0])).to_dict()
# # print(model.coef_)
# # coefficients_dict = dict(zip(X.columns, coefficients[0]))
# print(coefficients_dict)

In [83]:
# len(coefficients_dict)

In [84]:
# sorted(list(coefficients_dict.items()), key=lambda x : abs(x[1]), reverse=True)

In [85]:
# X = X-np.mean(X, axis=0)
# X = X/np.std(X, axis=0)
# X

In [86]:
# scores = cross_val_score(model, X, y, cv=10)

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



# # for i, score in enumerate(scores, 1):
# #     print(f'Fold {i}: Accuracy = {score:.2f}')

# # mean_accuracy = scores.mean()
# # std_accuracy = scores.std()

# # print(f'Mean Accuracy = {mean_accuracy:.2f}')
# # print(f'Standard Deviation = {std_accuracy:.2f}')

# model.fit(X_train, y_train)
# y_pred = model.predict(X_test)
# auc = roc_auc_score(y_test, y_pred)

# # Print the AUC score
# print(f'AUC: {auc:.2f}')

# coefficients = model.coef_
# coefficients_dict = (np.std(X, 0)*(model.coef_[0])).to_dict()

# sorted(list(coefficients_dict.items()), key=lambda x : abs(x[1]), reverse=True)

In [87]:
# final_df = pd.read_csv('test.csv')

In [88]:
# df.columns

In [89]:
# df['educational_attainment_BACHELOR_DEGREE'].value_counts()

In [90]:
# # Calculate the confusion matrix
# conf_matrix = confusion_matrix(y_test, y_pred)

# # Extract FP and TN from the confusion matrix
# FP = conf_matrix[0, 1]  # False Positives
# TN = conf_matrix[0, 0]  # True Negatives (row 0 = actual negative, col 0 = predicted negative; [1, 1] is true positives)

# # Calculate FPR
# FPR = FP / (FP + TN)
# print("False Positive Rate:", FPR)

In [91]:
# X = df.drop('completed', axis=1)
# y = df['completed']

# model = LogisticRegression()

# kfold = KFold(n_splits=10, shuffle=True, random_state=42)

# aucs = []
# fprs = []

# # print(X.shape, y.shape)
# # print(y)

# y = np.array(y)
# X = np.array(X)
# for train_index, test_index in kfold.split(X):
#     X_train, X_test = X[train_index], X[test_index]
#     y_train, y_test = y[train_index], y[test_index]

#     model.fit(X_train, y_train)
#     y_pred = model.predict(X_test)
#     auc = roc_auc_score(y_test, y_pred)
#     aucs.append(auc)

#     # Print the AUC score
#     # print(f'AUC: {auc:.2f}')
    
#     # Train and evaluate your model on the current fold
#     # You can replace this with your model training and evaluation code
#     conf_matrix = confusion_matrix(y_test, y_pred)

#     # Extract FP and TN from the confusion matrix
#     FP = conf_matrix[0, 1]  # False Positives
#     TN = conf_matrix[0, 0]  # True Negatives (row 0 = actual negative, col 0 = predicted negative; [1, 1] is true positives)

#     # Calculate FPR
#     FPR = FP / (FP + TN)
#     fprs.append(FPR)
#     # print("False Positive Rate:", FPR)

# auc_avg = np.average(aucs)
# auc_std = np.std(aucs)
# fpr_avg = np.average(fprs)
# fpr_std = np.std(fprs)


In [92]:
# print(f'AUC: {auc_avg:.2f} +/- {auc_std:.2f}')
# print(f'FPR: {fpr_avg:.2f} +/- {fpr_std:.2f}')