# **COURSERA DATA**

## Import libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

<br>
<hr>
<br>

## Import data

In [None]:
# Column labels that replace the CSV's own header row.
col_name = ["id", "title", "organizer", "type", "rating", "difficulty", "students"]
# header=0 drops the file's header line in favour of `names`; "id" becomes the index.
df = pd.read_csv(
    "../input/coursera-course-dataset/coursea_data.csv",
    header=0,
    names=col_name,
    index_col="id",
)
# Order the rows by their id.
df = df.sort_values("id")
df.head()

#### Convert students to integer

In [None]:
# Split df['students'] into its leading number and its trailing one-character suffix
students_text = df['students']
s = pd.DataFrame({
    # everything except the last character, parsed as a number
    'number': pd.to_numeric(students_text.str.slice(stop=-1)),
    # the last character only
    'value': students_text.str.get(-1),
})
s.head()

In [None]:
# Tally how often each suffix character occurs in s['value']
suffix_counts = s['value'].value_counts()
suffix_counts

In [None]:
# Replace each suffix with its multiplier: 'k' -> thousand, 'm' -> million
suffix_multipliers = {'k': 1000, 'm': 1000000}
for suffix, factor in suffix_multipliers.items():
    s.loc[s['value'] == suffix, 'value'] = factor
s.head()

In [None]:
# Multiply number and value, convert to integer and assign it back to df['students']
student_counts = pd.to_numeric(s['number'] * s['value'])
# Round before downcasting: a float product such as 4.1 * 1000000 is not exactly
# integral (4099999.9999999995), and downcast='integer' silently keeps the whole
# column as float64 when any single value is not a whole number.
df['students'] = pd.to_numeric(student_counts.round(), downcast='integer')
df['students'].head()

#### Info and description of DataFrame

In [None]:
# Print the column dtypes, non-null counts and memory usage of df
df.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
df.describe()

<br>
<hr>
<br>

## Top organizers

In [None]:
# Count courses per organizer once, then keep organizers with at least 10 courses
organizer_counts = df['organizer'].value_counts()
mask = organizer_counts.ge(10)
top_organizers = organizer_counts.loc[mask]
top_organizers

In [None]:
# Horizontal bar chart of the course count per top organizer
top_organizers.plot.barh(figsize=(14, 6), title="Top Organizers")

#### Courses of a particular organizer

In [None]:
# Courses offered by one organizer, best rated first
particular_organizer = "Google Cloud"
mask = df["organizer"].eq(particular_organizer)
df.loc[mask].sort_values("rating", ascending=False)

<br>
<hr>
<br>

## Certificate types

In [None]:
# Number of entries per certificate type
cert_types = df["type"].value_counts()
cert_types

It is worth noting that on the Coursera platform, Professional Certificates are made up of courses and/or Specializations. Each Specialization is in turn made up of courses. However, not every course belongs to a Specialization or Professional Certificate; some courses are independent and stand alone.

In [None]:
# Professional certificates, most enrolled first
mask = df["type"].eq("PROFESSIONAL CERTIFICATE")
df.loc[mask].sort_values("students", ascending=False)

<br>
<hr>
<br>

## Difficulty

In [None]:
# Number of entries per difficulty level
df["difficulty"].value_counts()

#### Advanced courses

In [None]:
# Entries marked "Advanced", listed alphabetically by title
mask = df["difficulty"].eq("Advanced")
df.loc[mask].sort_values("title")

<br>
<hr>
<br>

## Top rated courses

In [None]:
# Entries with a perfect 5.0 rating
mask = df['rating'].eq(5.0)
df.loc[mask]

<br>
<hr>
<br>

## Most popular courses

In [None]:
# Entries with at least 500,000 enrolled students, most enrolled first
mask = df['students'].ge(500000)
df.loc[mask].sort_values('students', ascending=False)

<br>
<hr>
<br>

## Search for a keyword in title

In [None]:
# Case-sensitive, literal substring search in the course titles
keyword = 'Data Science'
# regex=False treats the keyword as plain text; na=False keeps rows with a missing
# title out of the result (with str.find, NaN != -1 evaluates to True and such rows
# would be reported as matches).
mask = df['title'].str.contains(keyword, regex=False, na=False)
df[mask]

<br>
<hr>
<br>