In [1]:
import pandas as pd
import statsmodels.api as sm
from collections import Counter
# Load the user table that the rest of the notebook works from.
users = pd.read_csv('users.csv')

# Case-insensitive match on the free-text location field.
# na=False makes rows with a missing location count as "no match"; without it
# str.contains() returns NaN for them and the boolean mask below raises.
is_austin = users['location'].str.contains('Austin', case=False, na=False)

# Five Austin users with the highest follower counts, in descending order.
top_5_users_by_followers = users[is_austin].nlargest(5, 'followers')
top_5_logins_by_followers = ','.join(top_5_users_by_followers['login'])
top_5_logins_by_followers

'getify,benawad,steveklabnik,cloudflare,jbogard'

In [2]:
# Parse the sign-up timestamps so they compare chronologically rather than as text.
users['created_at'] = pd.to_datetime(users['created_at'])

# Case-insensitive match on the location field; na=False keeps a missing
# location from putting NaN into the mask (which would make the indexing raise).
is_austin = users['location'].str.contains('Austin', case=False, na=False)

# Five Austin users with the earliest creation timestamps, oldest first.
earliest_5_users = users[is_austin].nsmallest(5, 'created_at')
earliest_5_logins = ','.join(earliest_5_users['login'])
earliest_5_logins

'jnewland,joshknowles,hassox,jicksta,dan'

In [3]:
# Load the repository table used by the repository-level questions.
repos = pd.read_csv('repositories.csv')

# Count repositories per licence (missing licences are ignored by value_counts)
# and keep the names of the three most frequent ones, most common first.
license_counts = repos['license_name'].value_counts()
popular_licenses = list(license_counts.nlargest(3).index)
','.join(popular_licenses)

'mit,apache-2.0,other'

In [4]:
# Normalise company names before counting so that variants such as
# "@google", " Google" and "GOOGLE " collapse into a single bucket.
company_clean = (
    users['company']
    .str.strip()      # surrounding whitespace would shield the '@' and split the counts
    .str.strip('@')   # drop the '@' marker at either end
    .str.strip()      # whitespace that sat between the '@' and the name
    .str.upper()
)

# A value that is empty after cleaning names no company, so leave it out;
# missing companies are already skipped by value_counts().
company_counts = company_clean[company_clean.ne('')].value_counts()
most_common_company = company_counts.idxmax()
most_common_company

'GOOGLE'

In [5]:
# Number of repositories per language (repositories without a language are not counted).
language_counts = repos['language'].value_counts()

# The language with the largest repository count.
most_popular_language = language_counts.idxmax()
most_popular_language

'JavaScript'

In [6]:
# Users whose account creation timestamp is later than 2020-01-01.
joined_after_cutoff = users['created_at'] > '2020-01-01'
recent_users = users.loc[joined_after_cutoff]

# Repositories owned by any of those users.
owned_by_recent_user = repos['login'].isin(recent_users['login'])
recent_repos = repos.loc[owned_by_recent_user]

# Rank languages by repository count and take the runner-up (position 1).
recent_language_counts = recent_repos['language'].value_counts()
second_popular_language = recent_language_counts.nlargest(2).index[1]
second_popular_language

'HTML'

In [7]:
# Mean star count of the repositories in each language.
mean_stars_by_language = repos['stargazers_count'].groupby(repos['language']).mean()

# Note: despite its name, this variable holds the *language* with the highest mean.
avg_stars_per_language = mean_stars_by_language.idxmax()
avg_stars_per_language

'Fennel'

In [8]:
# leader_strength = followers / (1 + following); the +1 keeps the denominator
# non-zero for users who follow nobody.
users['leader_strength'] = users['followers'].div(users['following'].add(1))

# Logins of the five users with the highest leader strength, strongest first.
strongest_leaders = users.nlargest(5, 'leader_strength')
top_5_leader_strength = strongest_leaders['login']
','.join(top_5_leader_strength)

'getify,cloudflare,benawad,oracle,ContinuumIO'

In [9]:
# Correlation (pandas default method: Pearson) between follower count and
# number of public repositories, across all users.
follower_counts = users['followers']
public_repo_counts = users['public_repos']
correlation_followers_repos = follower_counts.corr(public_repo_counts)
round(correlation_followers_repos, 3)


0.151

In [10]:
# Ordinary least squares fit of followers on public_repos.
# add_constant() supplies the intercept column alongside the predictor.
X = sm.add_constant(users['public_repos'])
y = users['followers']
model = sm.OLS(y, X).fit()

# Fitted coefficient on public_repos, i.e. the slope of the regression line.
regression_slope_followers_repos = model.params['public_repos']
round(regression_slope_followers_repos, 3)

4.104

In [11]:
# Correlation between a repository having projects enabled and having a wiki enabled.
projects_enabled = repos['has_projects']
wiki_enabled = repos['has_wiki']
projects_wiki_correlation = projects_enabled.corr(wiki_enabled)
round(projects_wiki_correlation, 3)

0.274

In [12]:
# Mean "following" count of hireable users minus that of non-hireable users.
# Both groups are picked by explicit equality, so a row whose hireable value is
# missing belongs to neither group.
mean_following_hireable = users.loc[users['hireable'] == True, 'following'].mean()
mean_following_not_hireable = users.loc[users['hireable'] == False, 'following'].mean()
hireable_following_diff = mean_following_hireable - mean_following_not_hireable
round(hireable_following_diff, 3)

109.041

In [13]:
# Word count of each bio: a missing bio is treated as empty text, and words are
# whatever whitespace splitting produces.
bio_words = users['bio'].fillna('').str.split()
users['bio_word_count'] = bio_words.map(len)

# Restrict the regression to users who actually wrote something in their bio.
users_with_bio = users[users['bio_word_count'] > 0]
X_bio = sm.add_constant(users_with_bio['bio_word_count'])  # intercept + word count
y_bio = users_with_bio['followers']

# OLS fit of followers on bio length; the slope is the coefficient on bio_word_count.
bio_followers_model = sm.OLS(y_bio, X_bio).fit()
bio_followers_slope = bio_followers_model.params['bio_word_count']
round(bio_followers_slope, 3)

7.785

In [14]:
# Parse repository creation timestamps so the .dt accessor is available.
repos['created_at'] = pd.to_datetime(repos['created_at'])

# weekday numbers Monday as 0, so values 5 and 6 are Saturday and Sunday.
repos['is_weekend'] = repos['created_at'].dt.weekday.ge(5)

# Count weekend-created repositories per owner and keep the five largest counts.
# Note: despite its name, weekend_repos holds the owners' logins.
weekend_counts_by_login = repos.loc[repos['is_weekend']].groupby('login').size()
weekend_repos = weekend_counts_by_login.nlargest(5).index
','.join(weekend_repos)

'FellowTraveler,realityexpander,OR13,PaulBratslavsky,skeptycal'

In [15]:
# True where the user has an email value recorded.
has_email = users['email'].notna()

# Share of users with an email inside each group (mean of a boolean = fraction True).
# Rows whose hireable value is missing match neither comparison.
email_share_hireable = has_email[users['hireable'] == True].mean()
email_share_not_hireable = has_email[users['hireable'] == False].mean()

email_share_diff = round(email_share_hireable - email_share_not_hireable, 3)
email_share_diff

0.022

In [16]:
# Treat the last whitespace-separated word of each name as the surname.
# Missing names become empty strings and, like whitespace-only names, are skipped.
cleaned_names = users['name'].fillna('').str.strip()
surname_counter = Counter(name.split()[-1] for name in cleaned_names if name)

if not surname_counter:
    print("No names found.")
else:
    # Report every surname tied for the highest count, in alphabetical order.
    max_count = max(surname_counter.values())
    most_common_surnames = sorted(
        surname for surname, count in surname_counter.items() if count == max_count
    )
    print(','.join(most_common_surnames))

Labs,Moore,Smith
