In [2]:
import pandas as pd

# Load the data
users_df = pd.read_csv("/content/drive/MyDrive/TDS project1/users.csv")
repos_df = pd.read_csv("/content/drive/MyDrive/TDS project1/repositories.csv")


In [3]:
# Q-1
df = pd.read_csv("/content/drive/MyDrive/TDS project1/users.csv")
top_users = df.sort_values(by="followers", ascending=False).head(5)
top_users_logins = ", ".join(top_users["login"])

print("Top 5 users by followers:", top_users_logins)


Top 5 users by followers: tarsius, aalmiray, marcoroth, klmr, MrNeRF


In [4]:
# Q-2
earliest_users = users_df.sort_values(by="created_at").head(5)
earliest_users_list = ", ".join(earliest_users["login"].tolist())
print("5 earliest registered users:", earliest_users_list)


5 earliest registered users: bennyzen, aalmiray, pvillega, tarsius, amaunz


In [5]:
# Q-3
licenses = repos_df["license_name"].dropna()
top_3_licenses = licenses.value_counts().head(3).index.tolist()
print("Top 3 licenses:", ", ".join(top_3_licenses))


Top 3 licenses: mit, apache-2.0, other


In [6]:
# Q-4

# Filter out empty company entries
filtered_users = users_df[users_df["company"].notna() & (users_df["company"] != "")]

# Count occurrences of each company
company_counts = filtered_users["company"].value_counts()

# Find the most common company
top_company = company_counts.idxmax()
print("Company with the majority of developers:", top_company)


Company with the majority of developers: UNIVERSITY OF BASEL


In [7]:
# Q-5
# Filter out entries with missing or empty language fields
filtered_repos = repos_df[repos_df["language"].notna() & (repos_df["language"] != "")]

# Count occurrences of each language
language_counts = filtered_repos["language"].value_counts()

# Find the most common language
top_language = language_counts.idxmax()
print("Most popular programming language:", top_language)

Most popular programming language: JavaScript


In [8]:
# Q-6
recent_users = users_df[users_df["created_at"] > "2020-12-31"]

# Get the logins of users who joined after 2020
recent_user_logins = recent_users["login"].tolist()

# Filter repositories for those created by recent users
recent_user_repos = repos_df[repos_df["login"].isin(recent_user_logins)]

# Filter out entries with missing or empty language fields
recent_user_repos = recent_user_repos[recent_user_repos["language"].notna() & (recent_user_repos["language"] != "")]

# Count occurrences of each language
language_counts = recent_user_repos["language"].value_counts()

# Get the second most common language
second_most_popular_language = language_counts.index[1]
print("Second most popular programming language among users who joined after 2020:", second_most_popular_language)


Second most popular programming language among users who joined after 2020: HTML


In [9]:
# Q-7

# Filter out entries with missing or empty language fields
repos_df = repos_df[repos_df["language"].notna() & (repos_df["language"] != "")]

# Group by language and calculate the average stargazers_count for each language
average_stars_per_language = repos_df.groupby("language")["stargazers_count"].mean()

# Find the language with the highest average stars
highest_avg_star_language = average_stars_per_language.idxmax()
highest_avg_star_count = average_stars_per_language.max()

print("Language with the highest average number of stars per repository:", highest_avg_star_language)
print("Average number of stars:", highest_avg_star_count)


Language with the highest average number of stars per repository: PureScript
Average number of stars: 114.0


In [10]:
# Q-8

# Calculate leader_strength
users_df['leader_strength'] = users_df['followers'] / (1 + users_df['following'])

# Sort by leader_strength in descending order and select the top 5 users
top_leader_strength_users = users_df.sort_values(by='leader_strength', ascending=False).head(5)

# Get the 'login' values of the top 5 users as a comma-separated string
top_5_logins = ", ".join(top_leader_strength_users['login'])

print("Top 5 users in terms of leader_strength:", top_5_logins)


Top 5 users in terms of leader_strength: dpryan79, wasserth, ravage84, elanmart, quadbiolab


In [11]:
# Q-9
# Calculate the correlation between followers and public_repos
correlation = users_df['followers'].corr(users_df['public_repos'])

# Print the correlation rounded to 3 decimal places
print(f"Correlation between followers and public repositories: {correlation:.3f}")


Correlation between followers and public repositories: 0.345


In [12]:
# Q-10
from sklearn.linear_model import LinearRegression



# Reshape the data for sklearn
X = users_df['public_repos'].values.reshape(-1, 1)
y = users_df['followers'].values

# Create and fit the linear regression model
model = LinearRegression()
model.fit(X, y)

# Get the slope (coefficient) which indicates additional followers per additional public repository
slope = model.coef_[0]

# Print the slope rounded to 3 decimal places
print(f"Regression slope of followers on repos: {slope:.3f}")


Regression slope of followers on repos: 0.674


In [13]:
# Q-11
if repos_df['has_projects'].dtype == 'object':
    repos_df['has_projects'] = repos_df['has_projects'].map({'true': True, 'false': False})
if repos_df['has_wiki'].dtype == 'object':
    repos_df['has_wiki'] = repos_df['has_wiki'].map({'true': True, 'false': False})

correlation = repos_df['has_projects'].corr(repos_df['has_wiki'])

print(round(correlation, 3))


0.308


In [14]:
# Q12


# Convert the hireable column to boolean values (True for 'true', False for 'false')
users_df['hireable'] = users_df['hireable'].apply(lambda x: True if x == 'true' else False)

# Calculate the average number of people followed by hireable users
avg_following_hireable = users_df[users_df['hireable'] == True]['following'].mean()

# Calculate the average number of people followed by non-hireable users
avg_following_non_hireable = users_df[users_df['hireable'] == False]['following'].mean()

# Calculate the difference
difference = avg_following_hireable - avg_following_non_hireable

# Print the result, rounded to 3 decimal places
print(f"Average following difference (hireable - non-hireable): {difference:.3f}")


Average following difference (hireable - non-hireable): nan


In [15]:
# Q13
from sklearn.linear_model import LinearRegression
users_with_bio = users_df[(users_df['bio'].notna()) & (users_df['bio'] != '')].copy()
users_with_bio.loc[:, 'bio_len'] = users_with_bio['bio'].str.len()

X = users_with_bio['bio_len'].values.reshape(-1,1)
y = users_with_bio['followers']

lr2 = LinearRegression()
lr2.fit(X, y)
lr2.coef_[0]

0.3778362964269829

In [18]:
# Q-14
import csv
from collections import Counter
from datetime import datetime

weekend_repo_counts = Counter()

with open('/content/drive/MyDrive/TDS project1/repositories.csv', 'r', encoding='utf-8') as file:
    reader = csv.DictReader(file)

    for row in reader:
        created_at = row.get('created_at', '')
        if created_at:
            created_date = datetime.fromisoformat(created_at[:-1])

            if created_date.weekday() in [5, 6]:
                user_login = row['login']
                weekend_repo_counts[user_login] += 1

top_users = weekend_repo_counts.most_common(5)

top_logins = [user[0] for user in top_users]

print(','.join(top_logins))

dpryan79,syzer,ioolkos,maysam,pvillega


In [17]:
# Q-15
fraction_hierable = users_df[users_df['hireable'] == True]['email'].notna().mean()
fraction_non_hierable = users_df[users_df['hireable'] == False]['email'].notna().mean()
diff = fraction_hierable - fraction_non_hierable
diff

nan

In [19]:
# Q-16
new_users = users_df[users_df['name'].notna()].copy()
new_users['surname'] = new_users['name'].str.split().str[-1].str.strip()
surname_counts = new_users['surname'].value_counts()
max_count = surname_counts.max()
common_surnames = surname_counts[surname_counts == max_count].index.tolist()
common_surnames.sort()
print(','.join(common_surnames))

Arnold,Brand,Christensen,Fink,GmbH,Group,Guggisberg,Landolt,Roth,Tan
