# Import libraries and necessary files

In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np 
from sklearn.linear_model import LinearRegression

import warnings
warnings.filterwarnings('ignore')

In [3]:
# Read data files
users = pd.read_csv('users.csv')
repo = pd.read_csv('repositories.csv')

# Who are the top 5 users in Chicago with the highest number of followers? List their login in order, comma-separated.

In [None]:
users = users.sort_values(by ='followers', ascending=False)

In [None]:
users["login"][:5].tolist()

# Who are the 5 earliest registered GitHub users in Chicago? List their login in ascending order of created_at, comma-separated.

In [None]:
users['created_at'] = pd.to_datetime(users['created_at'], errors='coerce')
users = users.sort_values(by='created_at')
users.head(5)["login"].to_list()

# What are the 3 most popular license among these users? Ignore missing licenses. List the license_name in order, comma-separated.

In [None]:
repo['license_name'].value_counts()[:3]

# Which company do the majority of these developers work at?

In [None]:
users["company"].value_counts()[:1]

# Which programming language is most popular among these users?

In [None]:
repo["language"].value_counts()[:1]

# Which programming language is the second most popular among users who joined after 2020?

In [None]:
usersrepo = pd.merge(users.rename(columns={'created_at':'user_created_at'}), 
                        repo.rename(columns ={'created_at':'repo_created_at'}),
                         on='login', how ="left")
usersrepo[usersrepo['user_created_at']>'2020']['language'].value_counts()[:2]

# Which language has the highest average number of stars per repoitory?

In [None]:
repo.groupby('language')['stargazers_count'].mean().sort_values(ascending=False)[:1]

# Let's define leader_strength as followers / (1 + following). Who are the top 5 in terms of leader_strength? List their login in order, comma-separated.

In [None]:
users['leader_strength'] = users['followers']/(1+users['following'])

In [None]:
users.sort_values(by = 'leader_strength', ascending=False)['login'][:5]

# What is the correlation between the number of followers and the number of public repoitories among users in Chicago?

In [None]:
users[['followers', 'public_repos']].dropna().corr().round(3)

# Does creating more repos help users get more followers? Using regression, estimate how many additional followers a user gets per additional public repository.

In [None]:
# Prepare the independent variable (reshaped for sklearn)
X = users['public_repos'].values.reshape(-1, 1)

# Dependent variable
y = users['followers'].values

# Create the linear regression model
model = LinearRegression()

# Fit the model
model.fit(X, y)

# Get the slope (coefficient)
slope = model.coef_[0]

# Round the slope to 3 decimal places
slope_rounded = round(slope, 3)

print(f"Regression slope of followers on repos: {slope_rounded}")

# Do people typically enable projects and wikis together? What is the correlation between a repo having projects enabled and having wiki enabled?

In [14]:
repo[['has_projects', 'has_wiki']].dropna().corr().round(3)

Unnamed: 0,has_projects,has_wiki
has_projects,1.0,0.286
has_wiki,0.286,1.0


#  Do hireable users follow more people than those who are not hireable?

In [10]:
# Calculate average following for hireable users
# Calculate average following for non-hireable users
# Calculate the difference in averages
# Round the result to 3 decimal places

users['hireable'].fillna(np.nan, inplace =True)
hireable_avg_following = users[users['hireable'] == True]['following'].mean()
nonhireable_avg_following = users[users['hireable'].isna()]['following'].mean()
average_difference = hireable_avg_following - nonhireable_avg_following

print(f"Average following per user for hireable=true minus for non-hireable: {average_difference.round(3)}")

Average following per user for hireable=true minus for non-hireable: 111.688


# Some developers write long bios. Does that help them get more followers? What's the impact of the length of their bio (in Unicode words, split by whitespace) with followers? (Ignore people without bios)

In [None]:
#  Filter out rows with empty bios
users_with_bios = users[~users['bio'].isna()]

# Calculate the word count for each bio
users_with_bios['bio_word_count'] = users_with_bios['bio'].astype(str).apply(lambda x: len(x.split()))

# Prepare the independent and dependent variables
X = users_with_bios['bio_word_count'].values.reshape(-1, 1)  # Word count
y = users_with_bios['followers'].values  # Followers

# Perform linear regression
model = LinearRegression()
model.fit(X, y)

# Get the slope (coefficient)
slope = model.coef_[0]

print(f"Regression slope of followers on bio word count: {slope.round(3)}")

# Who created the most repositories on weekends (UTC)? List the top 5 users' login in order, comma-separated

In [None]:
# Adjust datatype of column
repo['created_at'] = pd.to_datetime(repo['created_at'], utc =True)
# Filter for weekends (Saturday and Sunday)
repo['weekday'] = repo['created_at'].dt.weekday
weekend_repos = repo[repo['weekday'] >= 5]  # Saturday (5) and Sunday (6)

# Group by 'login' and count weekend-created repo, then sort in descending order
top_users = (
    weekend_repos.groupby('login')
    .size()
    .sort_values(ascending=False)
    .head(5)
    .index
    .tolist()
)

# Join top users' login names into a comma-separated string
top_users_str = ', '.join(top_users)
print(f"Top 5 users' login who created most repositories on weekends: {top_users_str}")

# Do people who are hireable share their email addresses more often?

In [None]:
# A fraction of users refers to a proportion or percentage of users within a specific subset of the entire user base.

# For example, if you're calculating the fraction of users who have provided an email address, 
# it means finding the ratio of users with an email to the total number of users.

In [4]:
hireable_with_email = set(users[(~users['email'].isna()) & (users['hireable']==True)]['login'])

In [5]:
non_hireable_with_email  = set(users[(~users['email'].isna()) & (~(users['hireable']==True))]['login'])


In [6]:
x  = len(hireable_with_email) / len(set(users[~users['email'].isna()]['login']))
y = len(non_hireable_with_email) / len(set(users[~users['email'].isna()]['login']))
round(x-y, 3)

-0.454

# Let's assume that the last word in a user's name is their surname (ignore missing names, trim and split by whitespace.) What's the most common surname? (If there's a tie, list them all, comma-separated, alphabetically)

In [None]:
# Filter out empty names and split to get last words
users['surname'] = users['name'].str.strip().str.split().str[-1]

# Count occurrences of each surname
surname_counts = users['surname'].value_counts()

# Find the maximum count
max_count = surname_counts.max()

# Get all surnames that have the maximum count
most_common_surnames = surname_counts[surname_counts == max_count].index.tolist()

print(f"Most common surname(s): {most_common_surnames}")