In [1]:
import numpy as np
import pandas as pd
import datetime

from sklearn.linear_model import LinearRegression

In [2]:
df_user = pd.read_csv('./users.csv')
df_repo = pd.read_csv('./repositories.csv')

In [3]:
df_user.head()

Unnamed: 0,login,name,company,location,email,hireable,bio,public_repos,followers,following,created_at
0,iam-veeramalla,Abhishek Veeramalla,RED HAT,"Hyderabad, India",,,"Keep learning, sharing and growing || Principa...",45,16184,1,2018-09-19T05:58:52Z
1,in28minutes,,IN28MINUTES,"Hyderabad, India",in28minutes@gmail.com,True,"Helping 1 Million Learners learn Programming, ...",102,14368,0,2015-09-05T14:09:58Z
2,stacksimplify,STACKSIMPLIFY,STACKSIMPLIFY,Hyderabad,stacksimplify@gmail.com,,"Best Selling Instructor on Udemy - 2,10,000 St...",47,3230,0,2019-03-07T11:25:23Z
3,thenaveensaggam,NAVEEN SAGGAM,HTTPS://WWW.UIBRAINS.COM,Hyderabad,thenaveensaggam@gmail.com,,Founder: UiBrains Technologies\r\nEnthusiastic...,43,2164,1,2017-02-18T18:44:26Z
4,MadhavBahl,MADHAV BAHL,MICROSOFT,"Hyderabad, India",madhavbahl10@gmail.com,True,The Lean Programmer | Software Engineer @Micro...,128,1589,1,2017-03-04T06:16:43Z


In [4]:
df_repo.head()

Unnamed: 0,login,full_name,created_at,stargazers_count,watchers_count,language,has_projects,has_wiki,license_name
0,iam-veeramalla,iam-veeramalla/ansible-zero-to-hero,2024-05-15T12:27:58Z,607,607,HTML,True,False,Apache License 2.0
1,iam-veeramalla,iam-veeramalla/argo-cd,2020-11-27T11:24:01Z,199,199,,True,True,Apache License 2.0
2,iam-veeramalla,iam-veeramalla/argo-perf-test,2022-05-25T11:51:37Z,60,60,,True,True,
3,iam-veeramalla,iam-veeramalla/argo-rollouts,2022-09-30T11:46:15Z,65,65,,True,False,Apache License 2.0
4,iam-veeramalla,iam-veeramalla/argo-rollouts-manager,2023-05-15T02:23:07Z,89,89,,True,True,Apache License 2.0


In [5]:
df_user['hireable'].fillna("", inplace=True)

In [6]:
df_user.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 503 entries, 0 to 502
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   login         503 non-null    object
 1   name          497 non-null    object
 2   company       317 non-null    object
 3   location      503 non-null    object
 4   email         263 non-null    object
 5   hireable      503 non-null    object
 6   bio           415 non-null    object
 7   public_repos  503 non-null    int64 
 8   followers     503 non-null    int64 
 9   following     503 non-null    int64 
 10  created_at    503 non-null    object
dtypes: int64(3), object(8)
memory usage: 43.4+ KB


In [7]:
df_repo.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35185 entries, 0 to 35184
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   login             35185 non-null  object
 1   full_name         35185 non-null  object
 2   created_at        35185 non-null  object
 3   stargazers_count  35185 non-null  int64 
 4   watchers_count    35185 non-null  int64 
 5   language          25631 non-null  object
 6   has_projects      35185 non-null  bool  
 7   has_wiki          35185 non-null  bool  
 8   license_name      11749 non-null  object
dtypes: bool(2), int64(2), object(5)
memory usage: 1.9+ MB


In [8]:
df_user['created_at'] = pd.to_datetime(df_user['created_at'])

In [9]:
df_repo['created_at'] = pd.to_datetime(df_repo['created_at'])

# Q1

In [10]:
df_user.sort_values(by='followers', ascending=False)['login'].head()

0     iam-veeramalla
1        in28minutes
2      stacksimplify
3    thenaveensaggam
4         MadhavBahl
Name: login, dtype: object

iam-veeramalla,in28minutes,stacksimplify,thenaveensaggam,MadhavBahl

# Q2

In [11]:
df_user.sort_values(by='created_at', ascending=True)['login'].head()

89            shabda
28          sitaramc
484     bagwanpankaj
263    srikanthlogic
473      kulbirsaini
Name: login, dtype: object

shabda,sitaramc,bagwanpankaj,srikanthlogic,kulbirsaini

# Q3

In [12]:
df_repo['license_name'].value_counts().head(3)

license_name
MIT License           5942
Apache License 2.0    1973
Other                 1400
Name: count, dtype: int64

MIT License,Apache License 2.0,Other

# Q4

In [13]:
df_user['company'].value_counts().head(1)

company
MICROSOFT    17
Name: count, dtype: int64

IIIT HYDERABAD

# Q5

In [14]:
df_repo['language'].value_counts().head(1)

language
JavaScript    5628
Name: count, dtype: int64

JavaScript

# Q6

In [15]:
filtered_users = df_user[df_user['created_at'] > '2020-01-01']

In [16]:
joined_df = pd.merge(filtered_users, df_repo, on='login')

In [17]:
joined_df['language'].value_counts().head(2)

language
JavaScript    1622
HTML           777
Name: count, dtype: int64

HTML

# Q7

In [18]:
average_stars_per_language = df_repo.groupby('language')['stargazers_count'].mean()

In [19]:
average_stars_per_language.idxmax()

'Perl'

# Q8

In [20]:
df_leader_str = df_user.copy()

In [21]:
df_leader_str['leader_strength'] = df_leader_str['followers']/(1 + df_leader_str['following'])

In [22]:
df_leader_str.sort_values(by='leader_strength', ascending=False)['login'].head()

1        in28minutes
0     iam-veeramalla
2      stacksimplify
6      ashokitschool
3    thenaveensaggam
Name: login, dtype: object

in28minutes,iam-veeramalla,stacksimplify,ashokitschool,thenaveensaggam

# Q9

In [23]:
df_user['followers'].corr(df_user['public_repos'])

0.006259836233742388

# Q10

In [24]:
X = df_user[['public_repos']]
y = df_user['followers']

In [25]:
model = LinearRegression()
model.fit(X, y)

In [26]:
model.coef_[0]

0.0698094507828575

# Q11

In [27]:
df_repo['has_projects'].dropna().corr(df_repo['has_wiki'].dropna())

0.17339940645504281

# Q12

In [28]:
avg_following_hireable = df_user[df_user['hireable'] == True]['following'].mean()

In [29]:
avg_following_not_hireable = df_user[df_user['hireable'] != True]['following'].mean()

In [30]:
print(avg_following_hireable - avg_following_not_hireable)

33.676825809895576


# Q13

In [31]:
df_with_user_bio = df_user.copy()

In [32]:
df_with_user_bio.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 503 entries, 0 to 502
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype              
---  ------        --------------  -----              
 0   login         503 non-null    object             
 1   name          497 non-null    object             
 2   company       317 non-null    object             
 3   location      503 non-null    object             
 4   email         263 non-null    object             
 5   hireable      503 non-null    object             
 6   bio           415 non-null    object             
 7   public_repos  503 non-null    int64              
 8   followers     503 non-null    int64              
 9   following     503 non-null    int64              
 10  created_at    503 non-null    datetime64[ns, UTC]
dtypes: datetime64[ns, UTC](1), int64(3), object(7)
memory usage: 43.4+ KB


In [33]:
df_with_user_bio.dropna(subset='bio', inplace=True)

In [34]:
df_with_user_bio['bio_length'] = df_with_user_bio['bio'].apply(lambda x: len(str(x).split(' ')))

In [52]:
X = df_with_user_bio[['bio_length']]
y = df_with_user_bio['followers']

In [53]:
model = LinearRegression()
model.fit(X, y)

In [54]:
model.coef_[0]

7.001164654840586

# Q14

In [39]:
weekend_repos = df_repo[(df_repo['created_at'].dt.dayofweek) >= 5]

In [40]:
weekend_repos.head()

Unnamed: 0,login,full_name,created_at,stargazers_count,watchers_count,language,has_projects,has_wiki,license_name
18,iam-veeramalla,iam-veeramalla/devops-project-ideas,2023-04-22 13:51:19+00:00,285,285,,True,True,MIT License
25,iam-veeramalla,iam-veeramalla/go-web-app,2024-07-06 18:04:42+00:00,142,142,HTML,True,False,Apache License 2.0
32,iam-veeramalla,iam-veeramalla/k8s-kyverno-argocd,2023-02-19 05:45:11+00:00,202,202,,True,True,
35,iam-veeramalla,iam-veeramalla/MERN-docker-compose,2024-08-31 17:17:07+00:00,82,82,JavaScript,True,True,
37,iam-veeramalla,iam-veeramalla/officeassistant,2020-07-25 07:35:13+00:00,42,42,JavaScript,True,True,


In [41]:
weekend_repos['login'].value_counts().head()

login
hemanth22       224
anjijava16      178
wahidKhan74     176
elevenpassin    160
Shekharrajak    142
Name: count, dtype: int64

hemanth22,anjijava16,wahidKhan74,elevenpassin,Shekharrajak

# Q15

In [42]:
df_user.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 503 entries, 0 to 502
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype              
---  ------        --------------  -----              
 0   login         503 non-null    object             
 1   name          497 non-null    object             
 2   company       317 non-null    object             
 3   location      503 non-null    object             
 4   email         263 non-null    object             
 5   hireable      503 non-null    object             
 6   bio           415 non-null    object             
 7   public_repos  503 non-null    int64              
 8   followers     503 non-null    int64              
 9   following     503 non-null    int64              
 10  created_at    503 non-null    datetime64[ns, UTC]
dtypes: datetime64[ns, UTC](1), int64(3), object(7)
memory usage: 43.4+ KB


In [43]:
hireable_users = len(df_user[df_user['hireable'] == True])

In [44]:
unhireable_users = len(df_user[df_user['hireable'] != True])

In [45]:
hireable_users_email = len(df_user[df_user['hireable'] == True].dropna(subset='email'))

In [46]:
unhireable_users_email = len(df_user[df_user['hireable'] != True].dropna(subset='email'))

In [47]:
print((hireable_users_email/hireable_users)-(unhireable_users_email/unhireable_users))

0.2564641510692957


# Q16

In [55]:
df_user_with_names = df_user.copy()

In [56]:
df_user_with_names.dropna(subset = 'name', inplace = True)

In [57]:
df_user_with_names['surname'] = df_user_with_names['name'].apply(lambda x: x.lower().strip().split()[-1])

In [60]:
print(max_count)

14


In [66]:
# Most common surname
surname_counts[surname_counts == max_count]

surname
kumar    14
Name: count, dtype: int64