# A Basic EDA on Coursera Dataset
*Here I have done some basic visualization and exploration to get you going with the dataset. <br> Please feel free to make your own analysis. This is just an example.*

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's default theme first; the matplotlib settings below are applied after it.
sns.set();

# Notebook-wide plotting defaults: 20x10 figures, the dark background style, and no grid lines.
plt.rcParams['figure.figsize'] = 20, 10
plt.style.use('dark_background')
plt.rcParams['axes.grid'] = False

In [None]:
# Read the Coursera Python-courses CSV, then discard the "Unnamed: 0" column.
raw_courses = pd.read_csv('../input/python-courses-on-coursera/python_courses_coursera.csv')
df = raw_courses.drop("Unnamed: 0", axis=1)

In [None]:
# Let pandas render up to 100 rows before truncating displayed frames.
pd.options.display.max_rows = 100

## Cleaning

In [None]:
df

# <center> Simple Questions </center>

### #configurations

In [None]:
from matplotlib import cm

def get_color_spectrum(start, end, n, flipped= False):
    """Sample `n` colours from matplotlib's reversed inferno colormap.

    The colormap is evaluated at `n` evenly spaced positions between `start`
    and `end` (computed with `np.linspace`). When `flipped` is true, the
    sampled colours are returned in reverse order.
    """
    positions = np.linspace(start, end, n)
    colors = cm.inferno_r(positions)
    return colors[::-1] if flipped else colors

In [None]:
def plot_bar_values(plot, values, xoffset= 1, yoffset= 0.2, type_ = 'v', fontdict= None, **kwargs):
    '''Write a value label next to each bar of a bar chart.

    Parameters:
    --------------
    • plot: any object exposing a `.patches` sequence of bar rectangles, e.g.
    x = plt.bar(...)
    Then `x` becomes the `plot`. An Axes holding the bars works as well.

    • values: one label per bar (e.g. series.values). Each value is rendered
    with `str()`. Bars and values are paired in order; pairing stops at the
    shorter of the two.

    • xoffset, yoffset: amounts added to the label anchor point.

    • type_: 'h' for horizontal bars (anchor = bar width, bar y). Any other
    value is treated as vertical (anchor = bar x, bar height).

    • fontdict, **kwargs: forwarded unchanged to the axes' `text` call.
    '''
    horizontal = type_ == 'h'
    for patch, val in zip(plot.patches, values):
        if horizontal:
            x, y = patch.get_width(), patch.get_y()
        else:
            x, y = patch.get_x(), patch.get_height()
        # Draw on the axes that owns the bar, so labels land on the right plot
        # even when it is not pyplot's current axes (e.g. multi-axes figures).
        # Bars that are not attached to any axes fall back to the current one.
        target = getattr(patch, 'axes', None)
        if target is None:
            target = plt.gca()
        target.text(x + xoffset, y + yoffset, str(val), fontdict= fontdict, **kwargs)

# 

## Q1. Which university is providing most of the courses?

In [None]:
# Ten most frequent universities, reversed so the counts run in ascending order.
topUni = df.university.value_counts()[:10][::-1]
# Size the palette from the data rather than assuming exactly ten universities exist.
plot = plt.barh(topUni.index, topUni.values, color= get_color_spectrum(.4, .6, len(topUni), flipped= True))
plot_bar_values(plot, topUni.values, type_= 'h', xoffset= 5)

Here we can see that **Coursera Project Network** is the leading source of Python programs. However, on a closer look, Coursera Project Network provides "GUIDED PROJECT" programs, which are a separate type of program. If we are only interested in courses such as specializations and certifications, then the following graph, which excludes guided projects, is more helpful.

In [None]:
# Same ranking as before, but ignoring rows whose type is 'GUIDED PROJECT'.
topUni = df[df.type != 'GUIDED PROJECT'].university.value_counts()[:10][::-1]
# Size the palette from the data rather than assuming exactly ten universities exist.
plot = plt.barh(topUni.index, topUni.values, color= get_color_spectrum(.4, .6, len(topUni), flipped= True))
plot_bar_values(plot, topUni.values, type_= 'h')

# 

## Q2. How courses are divided (according to their type)?

In [None]:
# Number of programs per type.
# NOTE(review): the next cell recomputes this same value before plotting.
courseType = df.type.value_counts()

In [None]:
# Count programs per type and draw one labelled bar for each type.
courseType = df.type.value_counts()
type_colors = get_color_spectrum(.1, .5, len(courseType), flipped= True)
plot = plt.bar(courseType.index, courseType.values, color= type_colors)
plot_bar_values(plot, courseType.values, xoffset= 0.35, yoffset= 5)

So, yes - Coursera offers various 'COURSES' on Python. Specializations come third.<br>
Let's see which of the top-10 universities offers what.

In [None]:
# Keep the rows of every university in `topUni` except its last entry.
# NOTE(review): `topUni` is in ascending order, so `[:-1]` leaves out the
# university with the highest count - confirm this exclusion is intended.
selected_unis = topUni.index[:-1]
in_selection = df.university.isin(selected_unis)
topUniDF = df[in_selection]

In [None]:
# One horizontal bar group per university, split by program type.
type_palette = ['r', 'g', 'b', 'y']
sns.countplot(data= topUniDF, y= 'university', hue= 'type', palette= type_palette, saturation= 1)
plt.legend(loc= 'upper right')

Looking at the data, we can see that only two universities, 'HSE Uni' and 'Uni of Michigan', provide degrees on Python, and most of the universities focus more on COURSES.

### What about bottom 10 universities?

In [None]:
# Every third entry of the last 70 universities in the count ranking
# (guided projects excluded), reversed for plotting.
bottomUni = df[df.type != 'GUIDED PROJECT'].university.value_counts()[-70::3][::-1]
# Size the palette from the data so it stays correct when fewer than 70 universities exist.
plot = plt.barh(bottomUni.index, bottomUni.values, color= get_color_spectrum(0, 1, len(bottomUni), flipped= True))
plot_bar_values(plot, bottomUni.values, type_= 'h', xoffset= 0.1)

In [None]:
# Program-type breakdown for the sampled low-count universities.
in_bottom = df.university.isin(bottomUni.index)
bottomUniDF = df[in_bottom]
sns.countplot(data= bottomUniDF, x= 'university', hue= 'type', palette= ['r', 'g', 'b', 'y'], saturation= 1)
plt.xticks(rotation= 90)
plt.legend(loc= 'upper right')

Not much to get.

# 

# Q3. Universities ranked by their number of votes.

In [None]:
# Per university: mean review and total votes, ordered by votes (highest first);
# rows with a missing aggregate are discarded.
uniRanked = (
    df.groupby("university")[['review', 'votes']]
      .agg({"review": "mean", "votes": "sum"})
      .sort_values(by= "votes", ascending= False)
      .dropna()
)
uniRanked

In [None]:
# First and last ten rows of the vote ranking.
topRankers = uniRanked.head(10)
bottomRankers = uniRanked.tail(10)

In [None]:
# Stack both groups and reverse the row order.
stacked = pd.concat([topRankers, bottomRankers])
both = stacked.iloc[::-1]
both

In [None]:
fig, ax = plt.subplots(1, 1, figsize= (20, 15))

# A translucent grey bar of length 6 per university, then the mean-review bar on top of it.
ax.barh(both.index, 6, color= 'grey', alpha= 0.3)
ax.barh(both.index, both.review)
ax.set_yticklabels([])
for side in ('right', 'top', 'bottom'):
    ax.spines[side].set_visible(False)

# Recolour the review bars by patch position.
# NOTE(review): the indices assume `both` has 20 rows, so that patches 0-19 are
# the grey bars and 20-39 the review bars - confirm.
recolour_plan = [
    (slice(20, 25), (1, 0, 0, 0.5)),
    (slice(35, None), (0, 1, 0, 0.5)),
    (slice(25, 35), "grey"),
]
for span, colour in recolour_plan:
    for bar in ax.patches[span]:
        bar.set_color(colour)

# Vote total just past the bar end, university name right-aligned just inside it.
for bar, vote_total, uni_name in zip(ax.patches[20:], both.votes, both.index):
    bar_end = bar.get_width()
    text_y = bar.get_y() + 0.25
    ax.text(bar_end + 0.1, text_y, str(int(vote_total)), fontfamily= 'consolas')
    ax.text(bar_end - 0.1, text_y, str(uni_name), ha= 'right', fontfamily= "product sans")
    

# 

# Q4. Which university has most of the students?

In [None]:
# Fix the misspelled column name.
df = df.rename(columns= {"studets": "students"})

In [None]:
# Total students per university, largest first, rounded to one decimal.
students_per_uni = df.groupby("university")['students'].sum()
uniStud = students_per_uni.sort_values(ascending= False).round(1)

In [None]:
# Keep every second university of the ranking.
unis = uniStud.iloc[::2]

In [None]:
# Mean of the per-university totals (all universities, not only the plotted ones).
mean_students = uniStud.mean()

plt.bar(unis.index, unis.values);
plt.xticks(rotation= 90);
# axhline spans the whole axis width, so no x endpoints are needed and no
# university that is absent from `unis` gets added to the categorical axis.
plt.axhline(mean_students, ls= '--', lw= 2)
# Label and arrow target are derived from the computed mean so they cannot drift
# from the dashed line. The label is placed at x position 30, 1.5M above the mean.
plt.annotate("Mean: {:.0f}".format(mean_students), xy= ("Intel", mean_students), xytext= (30, mean_students + 1500000),
             arrowprops= dict(arrowstyle= "->", connectionstyle= "arc3, rad= -0.2"));

# Q5. Number of courses based on difficulty

Right off the bat, we can say that most of the programs are Intermediate, which accounts for `51%` of the programs across all difficulty levels.

In [None]:
# Two axes on one figure: `ax` holds the per-type countplot, the smaller `ax2` the overall totals.
# NOTE(review): both rectangles lie outside the 0-1 figure-fraction range; presumably this relies on
# the notebook's inline backend cropping to a tight bounding box - confirm before using another backend.
fig = plt.figure()
ax = fig.add_axes([1,1,1,1])
ax2 = fig.add_axes([1.5,1.5,0.5,0.5])
sns.countplot(x= "difficulty", data= df, hue= 'type', ax = ax)

# Number of programs per difficulty level, drawn on `ax2` with the count written on each bar.
diff = df.difficulty.value_counts()
plot = ax2.bar(diff.index, diff.values)
ax2.set_yticks([])
plot_bar_values(plot, diff.values, xoffset= 0.38, yoffset= -35, fontdict={"fontfamily": "product sans", "size": 20}, ha= 'center')

# 

# <center> A bit advanced questions </center>

## Q1. How many courses teach AI / ML / DL (which are, of course, related to Python)?

In [None]:
import re

In [None]:
# Alternation of AI-related abbreviations (AI, ML, DL, NLP, with optional dots)
# and spelled-out phrases. Every alternative is separated by `|`, so the NLP
# abbreviation and "Artificial Intelligence" each match on their own.
# Groups are non-capturing because only a yes/no match is needed.
pattern = (
    r'(?:\bA\.?I\.?\b)|(?:\bM\.?L\.?\b)|(?:\bD\.?L\.?\b)|(?:\bN\.?L\.?P\.?\b)'
    r'|(?:Artificial Intelligence)|(?:Machine Learning)|(?:Deep Learning)|(?:Reinforcement Learning)'
    r'|(?:Tensor\s?Flow)|(?:Natural Language Processing)|(?:Neural Networks?)'
)

In [None]:
# `str.match` only tests the beginning of each title, so a course such as
# "Introduction to Machine Learning" would be missed; `str.contains` searches
# the whole title. `na= False` treats a missing title as "no match" so the
# boolean mask never contains NaN.
with_MlAiDl = df[df.course.str.contains(pattern, flags= re.IGNORECASE, regex= True, na= False)]

In [None]:
with_MlAiDl

In [None]:
# Matching courses per university, reversed into ascending order for the horizontal chart.
# Computed once and reused for both the bars and their labels.
ai_counts = with_MlAiDl.university.value_counts()[::-1]
# One colour per bar, sized from the data instead of a fixed count.
plot = ai_counts.plot(kind= 'barh', color= get_color_spectrum(0.1, 0.5, len(ai_counts)))
plot_bar_values(plot, ai_counts.values, type_= 'h', xoffset= 0.2, yoffset= 0.1)

# 

## Q2. Plot pie charts for the top 5 universities offering AI/ML/DL programs, showing how their students are split between those programs and the rest of their courses

In [None]:
# Names of the five universities with the most matching courses.
top5_AI = with_MlAiDl.university.value_counts().head(5).index
top5_AI

In [None]:
# Matching courses that belong to those five universities.
from_top5_ai = with_MlAiDl.university.isin(top5_AI)
top5_AI_DF = with_MlAiDl[from_top5_ai]

In [None]:
# All remaining rows of the same five universities: same university, but not
# one of the rows already collected in `top5_AI_DF`.
from_top5 = df.university.isin(top5_AI)
already_ai = df.index.isin(top5_AI_DF.index)
top5_other_DF = df[from_top5 & ~already_ai]

In [None]:
top5_AI_DF.groupby("university")['students'].sum()

In [None]:
top5_other_DF.groupby("university")['students'].sum()

In [None]:
# Student totals per university, side by side: matching courses vs. everything else.
ai_students = top5_AI_DF.groupby("university")['students'].sum()
rest_students = top5_other_DF.groupby("university")['students'].sum()
AI_vs_REST = pd.DataFrame({"AI": ai_students, "REST": rest_students})
AI_vs_REST

In [None]:
# Preview pie for the first university, drawn on its own fresh figure rather
# than on whatever `ax` an earlier cell happened to leave behind.
# The unused return values get explicit names so IPython's `_` / `__`
# output-history variables are not overwritten.
fig, ax = plt.subplots()
patches, _texts, _autotexts = ax.pie(AI_vs_REST.iloc[0], colors= get_color_spectrum(.6, .8, 2), autopct= "%.2f%%")

In [None]:
fig = plt.figure(figsize= (20, 20))

# Axes rectangles are [left, bottom, width, height]: three pies on the upper
# row, two on the lower row.
ax1, ax2, ax3 = [fig.add_axes([left, 0.5, 0.3, 0.3]) for left in (0, 0.3, 0.6)]
ax4, ax5 = [fig.add_axes([left, 0.2, 0.3, 0.3]) for left in (0.15, 0.45)]

pie_axes = [ax1, ax2, ax3, ax4, ax5]
for idx, ax in enumerate(pie_axes):
    ax.set(xticks= [], yticks= [])
    uni_row = AI_vs_REST.iloc[idx]
    pie_colors = get_color_spectrum(.6, .8, 2)
    patches, _, __ = ax.pie(uni_row, colors= pie_colors, autopct= "%.2f%%")
    ax.set_xlabel(AI_vs_REST.index[idx])
# A single legend on the second axes, built from the wedges of the last pie drawn.
ax2.legend(patches, ["AI", "REST"], loc= 'upper center');

2 out of 5 universities have more students in their AI courses than in their other courses, while the other 3 universities (which are also in the top 5 for AI courses - not to forget that) have surprisingly few students in AI compared with the rest of their courses.

# 

# Q3. Which terms appear in the titles of the 50 courses with the most votes?

In [None]:
import string
from nltk.corpus import stopwords
import re

In [None]:
puncs = string.punctuation
puncs

In [None]:
# Character class matching any single ASCII punctuation character.
# It is built from `string.punctuation` so that no character is left out
# (the backslash in particular) and `re.escape` keeps `-`, `]`, `^` and `\`
# literal inside the brackets.
# NOTE(review): this reuses the name `pattern` from the AI/ML section; `remove` reads it at call time.
pattern = '[' + re.escape(string.punctuation) + ']'

In [None]:
# The 50 rows with the most votes. Sorting the frame directly avoids feeding
# index *labels* to the positional `iloc` (which is only right for a default
# 0..n-1 index). `.copy()` makes this an independent frame so that later column
# edits do not raise SettingWithCopyWarning.
top50_terms = df.sort_values(by= "votes", ascending= False).head(50).copy()

In [None]:
def remove(str_):
    """Return `str_` with every match of the notebook-level `pattern` deleted.

    `pattern` is looked up when the function is called, so the regex that is
    bound to that name at call time is the one applied.
    """
    cleaned = re.sub(pattern, '', str_)
    return cleaned

In [None]:
# Strip pattern matches from the titles. `assign` returns a new frame, which
# avoids attribute-style column assignment on a slice (SettingWithCopyWarning).
top50_terms = top50_terms.assign(course= top50_terms.course.apply(remove))

In [None]:
# One indicator column per space-separated, lower-cased title word.
lowered_titles = top50_terms.course.str.lower()
terms = lowered_titles.str.get_dummies(sep= " ")

In [None]:
# Import the corpus reader under an alias: binding the word list to the name
# `stopwords` would otherwise shadow the reader and make this cell fail with
# an AttributeError when it is run a second time.
from nltk.corpus import stopwords as nltk_stopwords

stopwords = nltk_stopwords.words("english")

In [None]:
# Position of "it" in the stop-word list (raises ValueError if the word is absent).
stopwords.index("it")

In [None]:
# Drop "it" from the stop-word list by value instead of by a hard-coded
# position: the position depends on the installed NLTK word list, and popping
# by index would remove a different word every time the cell is re-run.
# NOTE(review): presumably "it" is kept as a term because titles use "IT" - confirm.
if "it" in stopwords:
    stopwords.remove("it")

In [None]:
# Column names that are not stop words.
is_stopword = terms.columns.str.lower().isin(stopwords)
valids = terms.columns[~is_stopword]

In [None]:
# Keep only the non-stop-word columns.
terms = terms[valids]

In [None]:
# Number of titles containing each term, most frequent first.
term_totals = terms.sum()
most_used = term_totals.sort_values(ascending= False)

In [None]:
# Donut chart of the 15 most frequent terms: a pie with a black disc over its centre.
ax = plt.axes()
slice_sizes = most_used[:15]
slice_labels = most_used.index.str.title()[:15]
_, _, autopects = ax.pie(slice_sizes, colors= get_color_spectrum(.2, .9, 15), labels= slice_labels,
                         autopct= "%.1f", pctdistance= .9)
plt.setp(autopects, color= 'white', weight= 'bold', fontsize= 15.5)

my_circle = plt.Circle((0, 0), 0.8, color= 'black')
ax.add_artist(my_circle)

# 

## Q4. Top 3 courses (by number of students) from each difficulty.

In [None]:
df.difficulty.unique()

In [None]:
# Three 'Beginner' rows with the most students.
beginner_rows = df[df.difficulty == 'Beginner']
Beginner = beginner_rows.sort_values(by= "students", ascending= False).head(3)
Beginner

In [None]:
# Three 'Intermediate' rows with the most students.
intermediate_rows = df[df.difficulty == 'Intermediate']
Intermediate = intermediate_rows.sort_values(by= "students", ascending= False).head(3)
Intermediate

In [None]:
# Three 'Mixed' rows with the most students, copied so the frame can be edited afterwards.
mixed_rows = df[df.difficulty == 'Mixed']
Mixed = mixed_rows.sort_values(by= "students", ascending= False).head(3).copy()
Mixed

In [None]:
# Overwrite the cell at row position 1, column position 1 with a title containing a line break.
# NOTE(review): presumably column position 1 is `course` and row 1 holds this title, so that it
# wraps when drawn - confirm against the frame displayed above.
Mixed.iloc[1, 1] = 'Programming for Everybody\n(Getting Started with Python)'

In [None]:
# Three 'Advanced' rows with the most students.
advanced_rows = df[df.difficulty == 'Advanced']
Advanced = advanced_rows.sort_values(by= "students", ascending= False).head(3)
Advanced

In [None]:
import matplotlib.lines as line

In [None]:
# Text-only "poster": a title, one column per difficulty level, and the top
# three courses (title plus university) listed in each column.
fig = plt.figure(figsize= (30, 10))
ax = plt.axes()
ax.set(xticks= [], yticks= [])
for side in ["left", 'right', 'bottom', 'top']:
    ax.spines[side].set_visible(False)

plt.text(0.45, 0.9, "Top 3 courses by difficulty level", ha= "center",
         fontfamily= "product sans", fontweight= 5, fontsize= 70)

# Horizontal rule under the title, positioned in figure coordinates.
l1 = line.Line2D([0.2, 0.8], [0.75, 0.75], transform=fig.transFigure, figure=fig, color = 'white', linestyle='-',linewidth = 3, alpha = 0.3)
fig.lines.extend([l1])

# Explicit (label, frame) pairs replace the former `eval(name)` lookup, so the
# frames are referenced directly rather than resolved from strings at run time.
top3_by_level = [
    ("Beginner", Beginner),
    ("Intermediate", Intermediate),
    ("Mixed", Mixed),
    ("Advanced", Advanced),
]

for (course, top3), x, color in zip(top3_by_level, [0.1, 0.35, 0.60, 0.85], get_color_spectrum(0, .3, 4)):
    plt.text(x, -0.1, course, ha= "center",  fontfamily= "product sans", fontweight= 2, fontsize= 40, color= color)
    # Plain (course, university) tuples avoid positional `[0]` / `[1]` lookups
    # on a label-indexed Series, which newer pandas versions deprecate.
    title_rows = top3[['course', 'university']].itertuples(index= False, name= None)
    for (course_title, uni_name), ofset in zip(title_rows, np.arange(0.2, 0.7, 0.2)):
        plt.text(x, ofset, course_title, ha= "center", fontfamily= "product sans", fontweight= 5, fontsize= 25, color= color)
        plt.text(x, ofset - 0.04, uni_name, fontfamily= "product sans", fontweight= 5, fontsize= 15,  ha= "center", color= color)

# Vertical separators between the four columns, positioned in figure coordinates.
l2 = line.Line2D([0.28, 0.28], [0.1, 0.70], transform=fig.transFigure, figure=fig, color = 'white', linestyle='-',linewidth = 3, alpha = 0.3)
l3 = line.Line2D([0.50, 0.50], [0.1, 0.70], transform=fig.transFigure, figure=fig, color = 'white', linestyle='-',linewidth = 3, alpha = 0.3)
l4 = line.Line2D([0.68, 0.68], [0.1, 0.70], transform=fig.transFigure, figure=fig, color = 'white', linestyle='-',linewidth = 3, alpha = 0.3)
fig.lines.extend([l2, l3, l4])

# 

# Q5. Universities review plot

In [None]:
# Rows ordered by review score (highest first). Sorting the frame directly
# avoids feeding index *labels* to the positional `iloc`, which is only right
# for a default 0..n-1 index.
# `dropna()` then discards every row that has a missing value in any column.
tops = df.sort_values(by= "review", ascending= False).dropna()

In [None]:
# Scatter of review score per university on a frameless axes, with the first
# 30 rows of `tops` labelled by university name.
ax = plt.axes()
for side in ("left", 'right', 'bottom', 'top'):
    ax.spines[side].set_visible(False)

plt.scatter(df.university, df.review)
plt.xticks([]);

labelled = tops.head(30)
for u, r in zip(labelled.university, labelled.review):
    plt.text(u, r, str(u), rotation= 45)

# 

In [None]:
# Students by program type (rows) and difficulty (columns), using pivot_table's
# default aggregation, shown as a heatmap.
students_by_type_and_level = df.pivot_table(index= 'type', columns= 'difficulty', values= 'students')
sns.heatmap(students_by_type_and_level)

# 

# Q8. Most used terms in the course title

In [None]:
# Course titles with pattern matches stripped by `remove`.
courses = df.course.map(remove)

In [None]:
from wordcloud import WordCloud, STOPWORDS

In [None]:
# Flatten every cleaned title into one list of lower-cased words.
# A comprehension builds the list directly, instead of calling `apply` purely
# for its side effect (which also left a Series of `None` as the cell output).
all_terms = [term for title in courses for term in title.lower().split()]

In [None]:
# Turn the word list into a Series and keep only the words that are not stop words.
all_terms = pd.Series(all_terms)
not_stopword = ~all_terms.isin(stopwords)
all_terms = all_terms[not_stopword]

In [None]:
# Remove every occurrence of the word "using".
all_terms = all_terms[all_terms != "using"]

In [None]:
# Join the remaining words into a single string for the word-cloud generator.
text = ' '.join(all_terms)

wordcloud = WordCloud(background_color = 'black', colormap='rainbow', width = 1200,  height = 1080, max_words = 200).generate(text)

# Size only this figure instead of changing the notebook-wide
# `figure.figsize` default, which would leak into any cell run afterwards.
plt.figure(figsize= (12, 12))
plt.imshow(wordcloud)
plt.axis('off')
plt.show()