In [1]:
import pandas as pd
import statsmodels.api as sm
from collections import Counter
# Load the user table from users.csv (path is relative to the working directory).
# The later cells read and extend this frame through the name `users`.
users = pd.read_csv('users.csv')

In [2]:
# Who are the top 5 users in Austin with the highest number of followers?
# List their login in order, comma-separated.
#
# na=False matters: for a row whose location is missing, str.contains yields NaN,
# and a mask containing NaN cannot be used for boolean indexing. Missing
# locations are therefore treated as "not in Austin".
in_austin = users['location'].str.contains('Austin', case=False, na=False)
top_5_users_by_followers = users[in_austin].nlargest(5, 'followers')
# nlargest returns rows in descending follower order, so the join keeps the ranking.
top_5_logins_by_followers = ','.join(top_5_users_by_followers['login'])
top_5_logins_by_followers

'getify,benawad,steveklabnik,cloudflare,jbogard'

In [3]:
# Who are the 5 earliest registered GitHub users in Austin?
# List their login in ascending order of created_at, comma-separated.
#
# created_at is parsed to datetimes in place; the later "joined after 2020" cell
# compares against this same column.
users['created_at'] = pd.to_datetime(users['created_at'])
# na=False: a missing location gives NaN from str.contains, and a mask containing
# NaN cannot be used for boolean indexing; treat missing locations as non-matches.
in_austin = users['location'].str.contains('Austin', case=False, na=False)
earliest_5_users = users[in_austin].nsmallest(5, 'created_at')
# nsmallest returns rows oldest-first, which is the order the question asks for.
earliest_5_logins = ','.join(earliest_5_users['login'])
earliest_5_logins

'jnewland,joshknowles,hassox,jicksta,dan'

In [4]:
# The three licenses that appear on the most repositories, most frequent first,
# joined with commas. value_counts leaves out missing license names by default.
repos = pd.read_csv('repositories.csv')
license_counts = repos['license_name'].value_counts()
popular_licenses = list(license_counts.nlargest(3).index)
','.join(popular_licenses)

'mit,apache-2.0,other'

In [5]:
# Which company do the majority of these developers work at?
#
# The company field is free text, so it is normalised before counting to make
# variants such as "@google", "Google " and "GOOGLE" collapse into one key:
#   1. trim surrounding whitespace (otherwise " @google" would keep its "@"),
#   2. drop "@" characters at either end, then trim again to handle "@ google",
#   3. upper-case.
company_clean = (
    users['company']
    .str.strip()
    .str.strip('@')
    .str.strip()
    .str.upper()
)
# A value that was only "@" or whitespace cleans down to '', which is not a
# company. Missing values pass this filter but are dropped by value_counts.
company_counts = company_clean[company_clean != ''].value_counts()
most_common_company = company_counts.idxmax()
most_common_company

'GOOGLE'

In [6]:
# The language that is set on the largest number of repositories.
language_counts = repos['language'].value_counts()
most_popular_language = language_counts.idxmax()
most_popular_language

'JavaScript'

In [7]:
# Second most common repository language, restricted to repositories whose
# owner's created_at is later than 2020-01-01.
joined_recently = users['created_at'] > '2020-01-01'
recent_users = users[joined_recently]
owned_by_recent = repos['login'].isin(recent_users['login'])
recent_repos = repos[owned_by_recent]
top_two_languages = recent_repos['language'].value_counts().nlargest(2)
# Position 0 is the most common language; position 1 is the runner-up.
second_popular_language = top_two_languages.index[1]
second_popular_language

'HTML'

In [8]:
# Language whose repositories have the highest mean stargazers_count.
# Note: despite its name, avg_stars_per_language holds the winning language
# label, not the table of averages.
mean_stars_by_language = repos.groupby('language')['stargazers_count'].mean()
avg_stars_per_language = mean_stars_by_language.idxmax()
avg_stars_per_language

'Fennel'

In [9]:
# leader_strength = followers / (1 + following). The +1 keeps the denominator
# non-zero for users who follow nobody. Report the five highest-scoring logins,
# best first, comma-separated.
following_plus_one = 1 + users['following']
users['leader_strength'] = users['followers'].div(following_plus_one)
strongest_leaders = users.nlargest(5, 'leader_strength')
top_5_leader_strength = strongest_leaders['login']
','.join(top_5_leader_strength)

'getify,cloudflare,benawad,oracle,ContinuumIO'

In [10]:
# Correlation between follower count and public repository count, computed over
# every row of `users` and rounded to three decimals for display.
follower_counts = users['followers']
public_repo_counts = users['public_repos']
correlation_followers_repos = follower_counts.corr(public_repo_counts)
round(correlation_followers_repos, 3)


0.151

In [11]:
# Does creating more repos go along with more followers?
# Fit followers ~ const + public_repos by ordinary least squares and report the
# public_repos coefficient: the estimated extra followers per additional repo.
y = users['followers']
# add_constant supplies the intercept column; the OLS model here does not add one itself.
X = sm.add_constant(users['public_repos'])
model = sm.OLS(endog=y, exog=X).fit()
regression_slope_followers_repos = model.params['public_repos']
round(regression_slope_followers_repos, 3)

4.104

In [12]:
# Do repositories tend to have projects and the wiki enabled together?
# Measured as the correlation between the has_projects and has_wiki columns.
projects_enabled = repos['has_projects']
wiki_enabled = repos['has_wiki']
projects_wiki_correlation = projects_enabled.corr(wiki_enabled)
round(projects_wiki_correlation, 3)

0.274

In [13]:
# Mean `following` of users with hireable == True minus the mean for users with
# hireable == False; a positive value means hireable users follow more people.
# Rows where hireable is neither True nor False fall into neither group.
is_hireable = users['hireable'] == True
is_not_hireable = users['hireable'] == False
mean_following_hireable = users.loc[is_hireable, 'following'].mean()
mean_following_not_hireable = users.loc[is_not_hireable, 'following'].mean()
hireable_following_diff = mean_following_hireable - mean_following_not_hireable
round(hireable_following_diff, 3)

109.041

In [14]:
# Does a longer bio go along with more followers?
# Bio length is the number of whitespace-separated words; users without a bio
# are left out. Fit followers ~ const + bio_word_count and report the slope.
bio_words = users['bio'].fillna('').str.split()
users['bio_word_count'] = bio_words.map(len)
# A missing bio was filled with '' above, so it has zero words and is excluded here.
has_bio = users['bio_word_count'] > 0
y_bio = users.loc[has_bio, 'followers']
X_bio = sm.add_constant(users.loc[has_bio, 'bio_word_count'])
bio_followers_model = sm.OLS(endog=y_bio, exog=X_bio).fit()
bio_followers_slope = bio_followers_model.params['bio_word_count']
round(bio_followers_slope, 3)

7.785

In [15]:
# Top 5 logins by number of repositories created on a Saturday or Sunday,
# most prolific first, comma-separated.
# NOTE(review): the question says UTC; no timezone conversion happens here, so
# this assumes created_at is already recorded in UTC. Confirm against the data.
repos['created_at'] = pd.to_datetime(repos['created_at'])
# dt.weekday numbers Monday as 0, so 5 and 6 are Saturday and Sunday.
repos['is_weekend'] = repos['created_at'].dt.weekday.ge(5)
weekend_only = repos.loc[repos['is_weekend']]
weekend_repos = weekend_only.groupby('login').size().nlargest(5).index
','.join(weekend_repos)

'FellowTraveler,realityexpander,OR13,PaulBratslavsky,skeptycal'

In [16]:
# Fraction of hireable users with a non-missing email minus the same fraction
# for users with hireable == False; positive means hireable users share more.
is_hireable = users['hireable'] == True
is_not_hireable = users['hireable'] == False
email_share_hireable = users.loc[is_hireable, 'email'].notna().mean()
email_share_not_hireable = users.loc[is_not_hireable, 'email'].notna().mean()
email_share_diff = round(email_share_hireable - email_share_not_hireable, 3)
email_share_diff

0.022

In [17]:
# Most common surname, taking the last whitespace-separated word of each name
# as the surname. Missing and blank names are skipped. Ties are all printed,
# sorted alphabetically and comma-separated.
trimmed_names = users['name'].fillna('').str.strip()
surname_counter = Counter(
    full_name.split()[-1] for full_name in trimmed_names if full_name
)
if not surname_counter:
    print("No names found.")
else:
    max_count = max(surname_counter.values())
    # Keep every surname that reaches the top count, not just the first one seen.
    most_common_surnames = sorted(
        last_name
        for last_name, occurrences in surname_counter.items()
        if occurrences == max_count
    )
    print(','.join(most_common_surnames))

Labs,Moore,Smith


In [20]:
"""Thankyou"""

'Thankyou'