In [None]:
import os

# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _subfolders, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# If you like this EDA, please feel free to upvote it! I will appreciate the support! 

# Importing necessary libraries

In [None]:
# Data handling.
import numpy as np
import pandas as pd
# Plotly: low-level graph objects, the express API, figure factories
# (used for the distribution plot) and subplot layout helpers.
import plotly.graph_objects as go
import plotly.express as px
import plotly.figure_factory as ff
from plotly.offline import iplot
from pandas_profiling import ProfileReport
from plotly.subplots import make_subplots
import itertools
from sklearn.linear_model import LinearRegression
# NOTE(review): `iplot`, `ProfileReport`, `itertools`, `LinearRegression`, `np`
# and `pn` are not referenced by any cell visible here; confirm whether later
# cells need them before pruning.
import panel as pn
# Load Panel's "plotly" extension.
pn.extension("plotly")
import warnings
# NOTE(review): this silences every warning for the rest of the notebook,
# including deprecation warnings raised by pandas and plotly.
warnings.filterwarnings('ignore')

# Quick look at the data

In [None]:
# Read the Coursera course dataset; the first CSV column becomes the row index.
DATA_PATH = '/kaggle/input/coursera-course-dataset/coursea_data.csv'
df = pd.read_csv(DATA_PATH, index_col=0)

# Preview the first rows.
df.head()

In [None]:
# Dataset dimensions as a (rows, columns) tuple.
df.shape

In [None]:
# Column names, dtypes, non-null counts and memory usage.
df.info()

In [None]:
# Summary statistics of the dataset's columns.
df.describe()

In [None]:
# Number of missing values in each column.
df.isna().sum()

# Exploratory data analysis

### **Top-rated organizations by their courses**

In [None]:
# Rank organizations (not individual courses) by the rating of their
# best-rated course. Taking the top 10 *courses* instead lets one organization
# occupy several of the ten slots, and its bars are then drawn on top of each
# other on the categorical axis.
highest_rating = (
    df.groupby('course_organization')['course_rating']
    .max()
    .nlargest(10)
    .iloc[::-1]  # horizontal bars are drawn bottom-up, so put the best one last
    .reset_index()
)

# A single subplot is enough; the subplot title doubles as the chart title.
fig = make_subplots(rows=1, cols=1,
                    subplot_titles=['Top 10 Organizations by course rating'])

# `add_trace(..., row=, col=)` replaces the deprecated `Figure.append_trace`.
fig.add_trace(
    go.Bar(
        x=highest_rating['course_rating'],
        y=highest_rating['course_organization'],
        orientation='h',
        marker=dict(
            color='#6DDE89',
            line=dict(color='#6DDE89', width=1),
        ),
        name='',
    ),
    row=1, col=1,
)
fig.show()

<ul style="font-size:18px">
    <li>Universidad Austral and Imperial College London are the only two organizations that have the highest possible rating of 5.</li>
    <li>That doesn't mean that other organizations are "worse" than these two; the result may depend on the number of courses each organization has in this dataset.</li>
</ul>

### **Most popular courses by their certification type and difficulty**

In [None]:
# Number of courses per certification type.
# The column names are pinned explicitly ('index' holds the label,
# 'course_Certificate_type' holds the count) because the default names
# produced by `value_counts().reset_index()` changed in pandas 2.0, and the
# plotting cells look these columns up by name.
course_type = (
    df['course_Certificate_type']
    .value_counts()
    .rename_axis('index')
    .reset_index(name='course_Certificate_type')
)
course_type

In [None]:
# Number of courses per difficulty level.
# The column names are pinned explicitly ('index' holds the label,
# 'course_difficulty' holds the count) because the default names produced by
# `value_counts().reset_index()` changed in pandas 2.0, and the plotting cells
# look these columns up by name.
course_diff = (
    df['course_difficulty']
    .value_counts()
    .rename_axis('index')
    .reset_index(name='course_difficulty')
)
course_diff

In [None]:
# Labels are the first column and counts the second column of each
# value-counts frame. Reading them by position keeps this cell independent of
# the column names, which differ between pandas versions.
type_labels, type_counts = course_type.iloc[:, 0], course_type.iloc[:, 1]
diff_labels, diff_counts = course_diff.iloc[:, 0], course_diff.iloc[:, 1]

fig = make_subplots(
    rows=1, cols=2,
    subplot_titles=('Number of courses by certification type',
                    'Number of courses by difficulty'),
)

# Left panel: courses per certification type.
fig.add_trace(
    go.Bar(
        x=type_labels,
        y=type_counts,
        name='',
        marker=dict(color='#EC5C42'),
        text=type_counts,
    ),
    row=1, col=1,
)

# Right panel: courses per difficulty.
# The labels must come from the same frame as the counts: `df[...].unique()`
# returns categories in order of first appearance, which is not guaranteed to
# match the descending-count order of `value_counts()`.
fig.add_trace(
    go.Bar(
        x=diff_labels,
        y=diff_counts,
        name='',
        marker=dict(color='#42ECB2'),
        text=diff_counts,
    ),
    row=1, col=2,
)

fig.update_traces(textposition='outside')
fig.update_layout(
    showlegend=False,
    paper_bgcolor='rgb(248, 248, 255)',
    plot_bgcolor='rgb(248, 248, 255)',
    title_font_size=22,
)
fig.show()

<ul style="font-size:18px">
    <li>It seems that a lot of users on this platform are just beginning their journey in various fields, such as information technology, mathematics, social sciences, data science, marketing, etc.</li>
    <li>That's why they prefer to choose courses for novices, which often include a full roadmap to becoming a specialist in their field, rather than professional certificates oriented toward solving practical problems.</li>
</ul>

### **Number of courses on pie charts**

In [None]:
# Labels are the first column and counts the second column of each
# value-counts frame. Reading them by position keeps this cell independent of
# the column names, which differ between pandas versions.
type_labels, type_counts = course_type.iloc[:, 0], course_type.iloc[:, 1]
diff_labels, diff_counts = course_diff.iloc[:, 0], course_diff.iloc[:, 1]

fig = make_subplots(
    rows=1, cols=2,
    specs=[[{"type": "pie"}, {"type": "pie"}]],
    subplot_titles=('Number of courses by certification type',
                    'Number of courses by difficulty'),
)

# Left pie: share of courses per certification type.
fig.add_trace(
    go.Pie(values=type_counts, labels=type_labels),
    row=1, col=1,
)

# Right pie: share of courses per difficulty.
fig.add_trace(
    go.Pie(values=diff_counts, labels=diff_labels),
    row=1, col=2,
)

fig.update_layout(legend=dict(orientation="h"))
fig.show()

<ul style="font-size:17px">
    <li>The most popular type of education on the Coursera platform is <i>Courses</i> for <i>Beginners</i>.</li>
    <li>On the contrary, <i>Professional certificates</i> and courses for <i>Advanced</i> users are much less popular and receive little attention.</li>
</ul>

### **Distribution of course ratings**

In [None]:
ratings = df['course_rating']

# Histogram + KDE of the course ratings.
fig = ff.create_distplot(
    [ratings],
    ['courses'],
    show_rug=False,
    bin_size=0.05,
    colors=['#6DDE89'],
)

# (label, x position, line colour) for each central-tendency marker.
reference_lines = [
    ('Median', ratings.median(), '#97398C'),
    ('Mean', ratings.mean(), '#455AB8'),
    ('Mode', ratings.mode().values[0], '#C0D33F'),  # first mode if several
]
for label, position, color in reference_lines:
    fig.add_vline(
        x=position,
        line_dash='dash',
        line_color=color,
        line_width=5,
        annotation_text=label,
    )

# Rotate and lift every label so the three closely spaced markers stay
# readable. Passing `annotations=[...]` to `update_layout` merges by position
# into the existing list, so it only restyled the first label ("Median").
fig.update_annotations(textangle=-60, y=1.14, align='left')

fig.update_layout(
    title_text='Ratings of courses',
    showlegend=False,
    paper_bgcolor='rgb(248, 248, 255)',
    plot_bgcolor='rgb(248, 248, 255)',
    title_font_size=22,
)

fig.show()

<ul style="font-size:18px">
    <li>This distribution is left-skewed: most values are concentrated on the right side of the plot, with a long tail extending to the left.</li>
    <li>In a left-skewed distribution, the mean is typically lower than the median, because the tail of low ratings pulls the mean down.</li>
    <li>These results are very optimistic! Only a few courses have a rating below 4 points; for the most part, students like the courses on this platform!</li>
</ul>

### **Scatter plots of ratings and students per type of courses and their difficulty**

In [None]:
# How many thousands of students one unit of each magnitude suffix stands for.
_THOUSANDS_PER_SUFFIX = {'k': 1.0, 'm': 1_000.0}


def _enrolled_to_thousands(value):
    """Convert an enrolment string such as '5.3k' or '1.3m' to thousands of students.

    The trailing letter is a magnitude suffix ('k' = thousand, 'm' = million).
    Simply dropping it would turn '1.3m' into 1.3 thousand instead of 1300
    thousand. A value without a recognised suffix is treated as a plain
    head-count.

    Parameters
    ----------
    value : str
        Raw enrolment text; surrounding whitespace and letter case are ignored.

    Returns
    -------
    float
        Number of enrolled students, in thousands.

    Raises
    ------
    ValueError
        If the numeric part cannot be parsed as a float.
    """
    text = str(value).strip().lower()
    suffix = text[-1:]
    if suffix in _THOUSANDS_PER_SUFFIX:
        return float(text[:-1]) * _THOUSANDS_PER_SUFFIX[suffix]
    return float(text) / 1_000.0


df['students_in_thousands'] = df['course_students_enrolled'].apply(_enrolled_to_thousands)
df['students_in_thousands']

In [None]:
# Marker colour for each certification type.
certificate_color = {'SPECIALIZATION': '#D93319', 'COURSE': '#D99619', 'PROFESSIONAL CERTIFICATE':'#000'}

fig = go.Figure()

# One trace per certification type, so every colour gets a legend entry.
# With a single trace and a per-point colour array the colours cannot be
# decoded by the reader. A type missing from the colour map falls back to
# plotly's default colour.
for cert_type, courses in df.groupby('course_Certificate_type', sort=False):
    fig.add_trace(
        go.Scatter(
            x=courses['course_rating'],
            y=courses['students_in_thousands'],
            mode='markers',
            name=str(cert_type),
            marker=dict(color=certificate_color.get(cert_type)),
        )
    )

fig.update_layout(
    title_text = 'Rating and students per certification type',
    xaxis_title='Course rating',
    yaxis_title='Students enrolled (thousands)',
    legend_title_text='Certification type',
    paper_bgcolor='rgb(248, 248, 255)',
    plot_bgcolor='rgb(248, 248, 255)',
    title_font_size=22
)
fig.show()

In [None]:
# Marker colour for each difficulty level.
difficulty_color = {'Beginner': '#A122D2', 'Intermediate': '#D2227E', 'Mixed':'#000', 'Advanced':'#198CD9'}

fig = go.Figure()

# One trace per difficulty level, so every colour gets a legend entry.
# With a single trace and a per-point colour array the colours cannot be
# decoded by the reader. A level missing from the colour map falls back to
# plotly's default colour.
for difficulty, courses in df.groupby('course_difficulty', sort=False):
    fig.add_trace(
        go.Scatter(
            x=courses['course_rating'],
            y=courses['students_in_thousands'],
            mode='markers',
            name=str(difficulty),
            marker=dict(color=difficulty_color.get(difficulty)),
        )
    )

fig.update_layout(
    title_text = 'Rating and students per difficulty',
    xaxis_title='Course rating',
    yaxis_title='Students enrolled (thousands)',
    legend_title_text='Difficulty',
    paper_bgcolor='rgb(248, 248, 255)',
    plot_bgcolor='rgb(248, 248, 255)',
    title_font_size=22
)
fig.show()

### **What are the proportions of course difficulties per rating?**

In [None]:
# Number of courses for every (rating, difficulty) pair.
# The counted column is selected by name rather than with `.iloc[:, 0]`, so
# the result always exposes the count as 'course_title' (the column the
# plotting cell reads) regardless of the column order in `df`.
new_df = (
    df.groupby(['course_rating', 'course_difficulty'])['course_title']
    .count()
    .reset_index()
)
new_df

In [None]:
# Stacked bars: number of courses at each rating, split by difficulty.
# In `new_df` the 'course_title' column holds the course count per
# (rating, difficulty) pair, so it is relabelled on the axis.
# The figure returned by `px.bar` is kept and shown explicitly; the empty
# `go.Figure()` previously created here was never used.
fig = px.bar(
    new_df,
    x='course_rating',
    y='course_title',
    color='course_difficulty',
    labels={
        'course_rating': 'Course rating',
        'course_title': 'Number of courses',
        'course_difficulty': 'Difficulty',
    },
    title='Distribution of courses difficulties per ratings',
)
fig.show()

<ul>
    <li></li>
</ul>