## EDA of reviews and ratings of Coursera courses

In [None]:
# Notebook setup. Runs on the Kaggle Python 3 image
# (https://github.com/kaggle/docker-python), which ships the libraries below.

import os

import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

# Seaborn styling: "ticks" style with the muted palette; afterwards the
# single-letter color codes ("b", "g", ...) are mapped to the pastel palette.
sns.set(
    style="ticks",
    palette="muted",
    color_codes=True,
)
sns.set_color_codes("pastel")

# Input data files live in the "../input/" directory; list what is available.
print(os.listdir("../input"))

# Any results written to the current directory are saved as output.

### First import the dataset and output 5 first rows.

In [None]:
# Load the dataset: ratings (Label) and review texts (Review), grouped by CourseId.
data = pd.read_csv("../input/reviews_by_course.csv")
# preview the first 5 rows
data.head(n=5)

### Inspect the dataset. It looks like three reviews (Review) are missing and there are no missing ratings (Label).
Fill the missing reviews with an empty string.

In [None]:
# Inspect the dataset for missing values. DataFrame.info() prints its report
# itself and returns None, so it is called directly instead of being wrapped
# in print() (which would emit a stray "None" line after the report).
data.info()
# Replace missing review texts with an empty string. Only the "Review" column
# is filled, so a numeric column such as Label can never be silently turned
# into a mixed string/number column.
data = data.fillna({"Review": ""})
# Inspect again to confirm that no review is missing any more.
data.info()

### There are 1835 unique courses in the dataset.

In [None]:
# number of distinct course IDs (dropna=False: a missing ID, if any, still counts as one value)
data["CourseId"].nunique(dropna=False)

### Let's find courses with the highest number of reviews/ratings for the course.
And print top20.

In [None]:
# Count the reviews/ratings per course. value_counts() sorts in descending
# order, so the most reviewed courses come first.
reviews_number = data["CourseId"].value_counts()
# show the 20 courses with the most reviews
print(reviews_number.iloc[:20])

### Not surprising that Machine Learning by Andrew Ng has the most ratings. Perhaps, it is one of the best courses on Coursera!
### Let's see how far ahead of the other top-20 courses it is.

In [None]:
# horizontal bar chart of the 20 most reviewed courses
top20_reviews = reviews_number[:20]
ax = sns.barplot(x=top20_reviews, y=top20_reviews.index, color="b")
ax.set(xlabel="number of ratings", ylabel="Course ID")

### Time to find average ratings of the courses.
The histogram shows the distribution of the average rating.

In [None]:
# Average rating of every course, best rated first.
# Only the numeric "Label" column is aggregated: `data` also holds the text
# column "Review", and averaging it raises a TypeError on pandas >= 2.0 (older
# versions silently dropped it). Selecting the column explicitly also keeps
# this cell re-runnable after later cells add more columns to `data`.
average_rating = (
    data.groupby("CourseId")[["Label"]]
    .mean()
    .sort_values("Label", ascending=False)
)
# plot a histogram of the average ratings (one value per course)
sns.distplot(average_rating["Label"], kde=False).set(xlabel="average rating", ylabel="number of courses")

Zoom in on the x-axis to see the 3.8 - 5 range in detail.

In [None]:
# same histogram, restricted to courses whose average rating is at least 3.8
high_rated = average_rating.loc[average_rating["Label"] >= 3.8]
ax = sns.distplot(high_rated, kde=False)
ax.set(xlabel="average rating", ylabel="number of courses")

### 5.0, 4.0 and 3.0 seem to be outliers. I am wondering how many courses have average rating 5.0?
Turns out 239 or 13% of the total number of courses!

In [None]:
# Number of courses whose average rating is exactly 5.0.
# Summing the boolean mask yields a plain scalar. The previous
# int(<one-element Series>) conversion is deprecated in pandas 2.x and raises
# as soon as average_rating has more than one column.
av_rating_5 = int((average_rating["Label"] == 5.0).sum())
# total number of distinct courses, used as the denominator of the percentage
total_courses = data["CourseId"].nunique(dropna=False)
print("Number of courses with average rating 5.0:")
print(av_rating_5)
print("% of the total number of courses:")
print(av_rating_5 / total_courses * 100)

### The courses with an average rating of 5.0 have only between 1 and 32 ratings, with a median of 3 ratings.

In [None]:
# Review counts of the courses whose average rating is 5.0.
# average_rating is sorted by rating while reviews_number is sorted by count,
# so the boolean mask is aligned to reviews_number's index explicitly instead
# of relying on pandas' implicit reindexing of boolean Series keys (which
# emits a UserWarning). Courses absent from the mask are treated as False.
is_five_star = (average_rating["Label"] == 5.0).reindex(reviews_number.index, fill_value=False)
df_av_rating_5 = pd.DataFrame(reviews_number)[is_five_star]
# explore statistics
print(df_av_rating_5.describe())

### Let's turn to reviews and find the average number of characters in the review.
The histogram shows the distribution of the average review length.

In [None]:
# number of characters in every review
data["Review_len"] = data["Review"].str.len()
# Average of the numeric columns per course, longest average review first.
# numeric_only=True skips the text column "Review"; without it the mean
# raises a TypeError on pandas >= 2.0 (older versions silently dropped it).
average_len = (
    data.groupby("CourseId")
    .mean(numeric_only=True)
    .sort_values("Review_len", ascending=False)
)
# explore statistics
print(average_len["Review_len"].describe())
# plot a histogram of the average review length (one value per course)
sns.distplot(average_len["Review_len"], kde=False).set(xlabel="average number of characters in review", ylabel="number of courses")

### Put number of reviews, average rating and average length of the review together in one DataFrame.

In [None]:
# Build the DataFrame "analysis" by combining the three per-course metrics,
# matching rows on the shared CourseId index.

# reviews_number is a Series; turn it into a one-column DataFrame
df_reviews_number = reviews_number.to_frame()
# keep only the Review_len column of average_len, as a one-column DataFrame
df_average_len = average_len[["Review_len"]]

# review counts + average rating
analysis = pd.merge(df_reviews_number, average_rating, left_index=True, right_index=True)
# ... + average review length
analysis = analysis.merge(df_average_len, left_index=True, right_index=True)

# give the columns descriptive names
analysis.columns = ["reviews_number", "av_rating", "av_review_len"]
# show the first 5 rows
analysis.head(n=5)

### Let's see whether there are any correlations between columns in the new DataFrame.
There is a weak negative correlation between the average rating and average review length.

In [None]:
# pairwise Pearson correlations between the columns of "analysis"
analysis.corr(method="pearson")

### That is how the correlation looks on a scatter plot.

In [None]:
# scatter plot: average review length (x) against average rating (y)
ax = sns.scatterplot(data=analysis, x="av_review_len", y="av_rating")
ax.set(
    xlim=(0, 500),
    xlabel="average number of characters in review",
    ylabel="average rating",
)

### Built linear model. 
This model corroborates the negative correlation between the average rating and average review length.

In [None]:
# scatter plot with a fitted linear regression line on top
grid = sns.lmplot(data=analysis, x="av_review_len", y="av_rating")
grid.set(
    xlim=(-30, 500),
    ylim=(2, 5.1),
    xlabel="average number of characters in review",
    ylabel="average rating",
)

## P. S.
### Do people use exclamation marks in the reviews?
On average 0.4 times per review, but there is one review with 421 exclamation marks!!!

In [None]:
# number of "!" characters in every review
exclamation_counts = data["Review"].str.count("!")
data["excl_num"] = exclamation_counts
# explore the statistics
print(data["excl_num"].describe())

### We have to find that review!!!
Apparently, the course "getting-started-with-essay-writing" did not help the student to improve his/her essay writing skills :)

In [None]:
# Look up the review with the most exclamation marks.
# idxmax() returns an index *label*, so the row is fetched with .loc; .iloc
# expects a position and only agrees with the label while the index is the
# default 0..n-1 range. The row is looked up once and reused for both fields.
most_exclamations = data.loc[data["excl_num"].idxmax()]
print("Course Id:")
print(most_exclamations["CourseId"])
print("Review:")
print(most_exclamations["Review"])