In [1]:
import pandas as pd

users_df = pd.read_csv('../users.csv')
repos_df = pd.read_csv('../repositories.csv')


In [2]:
top_5_followers = users_df.sort_values('followers', ascending=False).head(5)['login'].tolist()
top_5_followers

['ValentineFernandes',
 'kovidgoyal',
 'slidenerd',
 'aryashah2k',
 'coding-parrot']

In [3]:
earliest_users = users_df.sort_values('created_at', ascending=True).head(5)['login']
earliest_users

597            ivank
461    sandeepshetty
707              svs
312     nitinhayaran
649          nischal
Name: login, dtype: object

In [4]:
top_licenses = repos_df['license_name'].dropna().value_counts().head(3).index.tolist()
top_licenses

['MIT License', 'Apache License 2.0', 'Other']

In [5]:
users_df['company'] = users_df['company'].str.strip().str.upper().str.lstrip('@')
most_common_company = users_df['company'].value_counts().idxmax()
most_common_company

'MASAI SCHOOL'

In [6]:
most_popular_language = repos_df['language'].value_counts().idxmax()
most_popular_language

'JavaScript'

In [7]:
post_2020_users = users_df[users_df['created_at'] > '2020-01-01']
post_2020_repos = repos_df[repos_df['login'].isin(post_2020_users['login'])]
second_most_popular_language = post_2020_repos['language'].value_counts().nlargest(2).index[-1]
second_most_popular_language

'HTML'

In [8]:
language_avg_stars = repos_df.groupby('language')['stargazers_count'].mean().idxmax()
language_avg_stars

'TSQL'

In [9]:
users_df['leader_strength'] = users_df['followers'] / (1 + users_df['following'])
top_5_leaders = users_df.sort_values('leader_strength', ascending=False).head(5)['login'].tolist()
top_5_leaders

['kovidgoyal', 'coding-parrot', 'gkcs', 'slidenerd', 'dmalvia']

In [10]:
followers_repos_corr = users_df['followers'].corr(users_df['public_repos'])
followers_repos_corr 

0.03473876681266825

In [11]:
from sklearn.linear_model import LinearRegression

model = LinearRegression()
model.fit(users_df[['public_repos']], users_df['followers'])
followers_per_repo = model.coef_[0]
followers_per_repo

0.1014452189397404

In [12]:
projects_wiki_corr = repos_df['has_projects'].corr(repos_df['has_wiki'])
projects_wiki_corr

0.16207650017644518

In [13]:
avg_following_hireable = users_df[users_df['hireable'] == True]['following'].mean()
avg_following_non_hireable = users_df[users_df['hireable'] == False]['following'].mean()
following_diff = avg_following_hireable - avg_following_non_hireable
following_diff

nan

In [14]:
import pandas as pd
from sklearn.linear_model import LinearRegression

# Load the users data
users_df = pd.read_csv('../users.csv')

# Calculate the length of bios
users_df['bio_length'] = users_df['bio'].dropna().apply(len)

# Filter out users without bios
filtered_df = users_df[users_df['bio_length'].notna()]

# Prepare the data for regression
X = filtered_df[['bio_length']]
y = filtered_df['followers']

# Perform linear regression
model = LinearRegression()
model.fit(X, y)

# Get the regression slope
regression_slope = model.coef_[0]

# Output the result
print(f"Regression slope of followers on bio length: {regression_slope:.3f}")


Regression slope of followers on bio length: -0.106


In [15]:
repos_df['created_at'] = pd.to_datetime(repos_df['created_at'])
repos_df['is_weekend'] = repos_df['created_at'].dt.weekday >= 5
weekend_repos = repos_df[repos_df['is_weekend']]
top_5_weekend_users = weekend_repos['login'].value_counts().head(5).index.tolist()
top_5_weekend_users

['Kushal334', 'alokproc', 'patilswapnilv', 'rajeshpillai', 'deadcoder0904']

In [16]:
import pandas as pd

# Load the users data
users_df = pd.read_csv('../users.csv')

# Count users with emails when hireable = True
hireable_users = users_df[users_df['hireable'] == True]
hireable_with_email = hireable_users['email'].notna().sum()
hireable_fraction = hireable_with_email / len(hireable_users) if len(hireable_users) > 0 else 0

# Count users with emails when hireable = False
non_hireable_users = users_df[users_df['hireable'] == False]
non_hireable_with_email = non_hireable_users['email'].notna().sum()
non_hireable_fraction = non_hireable_with_email / len(non_hireable_users) if len(non_hireable_users) > 0 else 0

# Calculate the difference
email_difference = hireable_fraction - non_hireable_fraction

# Output the result
print(f"Difference in email sharing: {email_difference:.3f}")


Difference in email sharing: 0.595


In [17]:
users_df['surname'] = users_df['name'].dropna().apply(lambda x: x.strip().split()[-1])
most_common_surname = users_df['surname'].value_counts().nlargest(1).index.tolist()
most_common_surname

['Singh']