In [None]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import Markdown, display, HTML

# Fix the dying kernel problem (only a problem in some installations - you can remove it, if it works without it)
import os
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'

# Numpy

For a detailed reference check out: https://numpy.org/doc/stable/reference/arrays.indexing.html.

## Creating numpy arrays

### Directly

In [None]:
# Build a 3x3 float matrix by handing a nested list literal to np.array.
a = np.array([
    [1.0, 2.0, 3.0],
    [4.0, 5.0, 6.0],
    [7.0, 8.0, 9.0],
])

print(a)

### From a list

In [None]:
# Begin with an ordinary nested Python list ...
a = [
    [1.0, 2.0, 3.0],
    [4.0, 5.0, 6.0],
    [7.0, 8.0, 9.0],
]
print(a, end="\n\n")

# ... and let np.array turn the list of rows into a 2D array.
a = np.array(a)
print(a)

### From a list comprehension

In [None]:
# Squares of 0..9 collected with a list comprehension, shown first as a
# plain list and then as a numpy array.
a = [i * i for i in range(10)]

print(a, end="\n\n")
print(np.array(a))

### Ready-made functions in numpy

In [None]:
# All zeros: np.zeros takes the desired shape as a tuple.
a = np.zeros((3, 4))
print("All zeros", a, sep="\n", end="\n\n")

# One constant everywhere, variant 1: np.full(shape, value).
a = np.full((3, 4), 7.0)
print("All chosen value (variant 1)", a, sep="\n", end="\n\n")

# Variant 2: allocate zeros, then overwrite every entry through a full slice.
a = np.zeros((3, 4))
a[:] = 7.0
print("All chosen value (variant 2)", a, sep="\n", end="\n\n")

# Random integers drawn from [low, high) -- the upper bound is exclusive.
a = np.random.randint(low=0, high=10, size=(3, 2))
print("Random integers", a, sep="\n", end="\n\n")

# Gaussian samples: loc is the mean, scale is the standard deviation.
print("Random values from the normal distribution")
a = np.random.normal(loc=0, scale=10, size=(3, 2))
print(a)

## Slicing numpy arrays

### Slicing in 1D

To obtain only specific values from a numpy array, one can use so-called slicing. It has the form

**arr[low:high:step]**

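where `low` is the first index to be retrieved, `high` is the first index **not** to be retrieved (the upper bound is exclusive), and `step` means that every `step`-th element is taken. Each of the three parts may be omitted, and a negative `step` walks through the array backwards.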

In [None]:
# This section is about numpy arrays, so slice an ndarray rather than a plain
# Python list. The 1D slice syntax is the same for both, but here every
# result is again a numpy array.
a = np.array([i**2 for i in range(10)])

print("Original: ", a)
print("First 5 elements:", a[:5])             # low omitted -> start at index 0
print("Elements from index 3 to index 5:", a[3:6])  # high=6 is exclusive
print("Last 3 elements (negative indexing):", a[-3:])
print("Printed in reverse order:", a[::-1])  # step of -1 walks backwards
print("Every second element:", a[::2])

### Slicing in 2D

In two dimensions it works similarly, just the slicing is separate for every dimension.

In [None]:
# 5x5 grid holding 0..24, filled row by row.
a = np.array(range(25)).reshape(5, 5)

print("Original: ", a, sep="\n", end="\n\n")

# Rows 0-2, columns 0-1.
print("First 2 elements of the first 3 row:", a[:3, :2], sep="\n", end="\n\n")

# Rows 1-3, columns 1-3.
print("Middle 3 elements from the middle 3 rows:", a[1:4, 1:4], sep="\n", end="\n\n")

# Negative bounds count from the end of each axis.
print("Bottom-right 3 by 3 submatrix (negative indexing):", a[-3:, -3:], sep="\n", end="\n\n")

# A step of -1 along the second axis walks every row backwards.
print("Reversed columns:", a[:, ::-1], sep="\n", end="\n\n")

### Setting numpy array field values

In [None]:
# 5x5 grid holding 0..24, filled row by row.
a = np.array(range(25)).reshape(5, 5)

print("Original: ", a, sep="\n", end="\n\n")

# Assigning a scalar to a slice writes that value into every selected cell.
a[1:4, 1:4] = 5.0

print("Middle values changed to 5", a, sep="\n", end="\n\n")

# 3x3 matrix of i^2 - i for i = 0..8.
b = np.array([i * (i - 1) for i in range(9)]).reshape(3, 3)

print("Second matrix", b, sep="\n", end="\n\n")

# Assigning an array of matching shape copies it cell by cell into the slice.
a[1:4, 1:4] = b

print("Second matrix substituted into the middle of the first matrix")
print(a)

## Operations on numpy arrays

It is important to remember that arithmetic operations on numpy arrays are always element-wise.

In [None]:
# First operand: squares of 0..8 arranged as a 3x3 integer matrix.
a = np.array([i * i for i in range(9)]).reshape(3, 3)
print(a, end="\n\n")

# Second operand: square roots of 0..8 arranged as a 3x3 float matrix.
b = np.array([i ** 0.5 for i in range(9)]).reshape(3, 3)
print(b, end="\n\n")

### Element-wise sum

In [None]:
# `+` on two equally shaped arrays adds the entries at matching positions.
print(a + b)

### Element-wise multiplication

In [None]:
# `*` multiplies entries at matching positions -- it is NOT the matrix product.
print(a * b)

### Matrix multiplication

In [None]:
# True matrix product (rows times columns), unlike the element-wise `*`.
print(np.matmul(a, b), end="\n\n")

# Sanity check: multiplying by the identity matrix on the left must return
# the entries of `a` unchanged.
id_matrix = [
    [1.0, 0.0, 0.0],
    [0.0, 1.0, 0.0],
    [0.0, 0.0, 1.0],
]

print(np.matmul(id_matrix, a))

### Calculating the mean

In [None]:
# Five random integers from 0..9 (the upper bound 10 is exclusive).
a = np.random.randint(low=0, high=10, size=5)
print(a, end="\n\n")

# Mean computed by hand: total divided by the number of elements.
total = np.sum(a)
print("Mean: ", total / len(a))

### Calculating the mean of every row

In [None]:
# 5 rows x 3 columns of random integers from 0..9.
a = np.random.randint(low=0, high=10, size=(5, 3))
print(a, end="\n\n")

# Summing along axis=1 adds up the entries of each row; dividing by the
# number of columns turns those totals into row means.
n_columns = a.shape[1]
row_means = np.sum(a, axis=1) / n_columns

print("Mean:", row_means)

print("Mean in the original matrix form:")
print(row_means.reshape(-1, 1))  # -1 lets numpy infer the number of rows

### More complex operations

In [None]:
# numpy's element-wise functions also accept plain Python lists.
a = [1.0, 2.0, 3.0]

squares = np.power(a, 2)
exponentials = np.exp(a)

print("Vector to power 2 (element-wise)", squares, sep="\n", end="\n\n")
print("Euler number to the power a (element-wise)", exponentials, sep="\n", end="\n\n")

# (a^2 + e^a) / sum(a): the scalar sum divides every entry of the vector.
print("An even more complex expression")
print((squares + exponentials) / np.sum(a))

## Numpy tasks

**Task 1.** Calculate the sigmoid (logistic) function on every element of the following numpy array [0.3, 1.2, -1.4, 0.2, -0.1, 0.1, 0.8, -0.25] and print the last 5 elements. Use only vector operations.

In [None]:
# Write your code here

**Task 2.** Calculate the dot product of the following two vectors:<br/>
$x = [3, 1, 4, 2, 6, 1, 4, 8]$<br/>
$y = [5, 2, 3, 12, 2, 4, 17, 11]$<br/>
a) by using element-wise multiplication and np.sum,<br/>
b) by using np.dot,<br/>
c) by using np.matmul and transposition (x.T).

In [None]:
# Write your code here

**Task 3.** Calculate the following expression<br/>
$$\frac{1}{1 + e^{-x_0 \theta_0 - \ldots - x_9 \theta_9 - \theta_{10}}}$$
for<br/>
$x = [1.2, 2.3, 3.4, -0.7, 4.2, 2.7, -0.5, -2.1, -3.3, 0.2]$<br/>
$\theta = [7.7, 0.33, -2.12, -1.73, 2.9, -5.8, -0.9, 12.11, 3.43, -0.5, 1.65]$<br/>
and print the result. Use only vector operations.

In [None]:
# Write your code here

# Pandas

## Load datasets

- Steam (https://www.kaggle.com/tamber/steam-video-games)

- MovieLens (https://grouplens.org/datasets/movielens/)

In [None]:
# Steam: column names are supplied explicitly through `names`, so pandas
# treats the first line of the file as data rather than as a header.
steam_path = os.path.join("data", "steam", "steam-200k.csv")
steam_columns = ['user-id', 'game-title', 'behavior-name', 'value', 'zero']
steam_df = pd.read_csv(steam_path, names=steam_columns)

# MovieLens: both files live in the same directory and are read with the
# column names taken from their header rows.
movielens_dir = os.path.join("data", "movielens_small")
ml_ratings_df = pd.read_csv(os.path.join(movielens_dir, "ratings.csv"))
ml_movies_df = pd.read_csv(os.path.join(movielens_dir, "movies.csv"))

## Peek into the datasets

In [None]:
# First 10 rows of the Steam interactions table.
steam_df.head(10)

In [None]:
# First 10 rows of the MovieLens ratings table.
ml_ratings_df.head(10)

In [None]:
# head() without an argument shows the first 5 rows.
ml_movies_df.head()

## Merge both MovieLens DataFrames into one

In [None]:
# Join ratings with movie metadata on the shared movieId column, so every
# rating row also carries the columns of the matching movie.
ml_df = pd.merge(left=ml_ratings_df, right=ml_movies_df, on='movieId')
ml_df.head(10)

## Choosing a row, a column or several columns

In [None]:
display(HTML(steam_df.head(10).to_html()))

# Choosing rows by index label: .loc slices on the index VALUES and includes
# the end label, so 3:5 selects the rows labelled 3, 4 and 5.
# (A bare steam_df[3:6] slices by position, exactly like .iloc below, so it
# would not demonstrate label-based selection.)
chosen_df = steam_df.loc[3:5]

print("Choosing rows by index")
display(HTML(chosen_df.head(10).to_html()))

# Choosing rows by position: .iloc counts rows from 0 and excludes the end,
# so 3:6 selects the 4th, 5th and 6th rows. With the default 0..n-1 index of
# steam_df both selections show the same rows.
chosen_df = steam_df.iloc[3:6]

print("Choosing rows by position")
display(HTML(chosen_df.head(10).to_html()))

In [None]:
# Choosing a column
chosen_df = steam_df["game-title"]

print(chosen_df.head(10))

In [None]:
# Choosing several columns
chosen_df = steam_df[['user-id', 'game-title']]

display(HTML(chosen_df.head(10).to_html()))

### Splitting the dataset into training and test set

In [None]:
# Two equivalent ways of building the positions 0..len(steam_df)-1:
shuffle = np.array(list(range(len(steam_df))))

# alternatively

shuffle = np.arange(len(steam_df))

# Shuffle the positions in place (no seed is set, so the split differs
# between runs).
np.random.shuffle(shuffle)
# .tolist() yields plain Python ints. list(shuffle) would keep numpy scalars,
# which NumPy >= 2 prints as np.int64(...) and clutters the output below.
shuffle = shuffle.tolist()
print("Shuffled range of indices")
print(shuffle[:20])
print()

# The first 80% of the shuffled positions form the training set, the rest
# the test set.
train_test_split = 0.8
split_index = int(len(steam_df) * train_test_split)

training_set = steam_df.iloc[shuffle[:split_index]]
test_set = steam_df.iloc[shuffle[split_index:]]

display(HTML(training_set.head(10).to_html()))

display(HTML(test_set.head(10).to_html()))

## Filtering

### Filtering columns

In [None]:
# .loc[rows, columns]: the bare `:` keeps every row, the list names the
# columns to keep.
chosen_df = steam_df.loc[:, ['user-id', 'game-title']]

display(HTML(chosen_df.head(10).to_html()))

### Filtering rows

In [None]:
# Comparing a column with a value yields a boolean Series (one True/False
# per row) ...
condition = steam_df['game-title'] == 'Fallout 4'

print(condition.head(10))

# ... and passing that mask to .loc keeps only the rows marked True.
chosen_df = steam_df.loc[condition]

display(HTML(chosen_df.head(10).to_html()))

### Filtering rows and columns at once

In [None]:
# Build one boolean mask per criterion and combine them with `&`
# (element-wise AND).
is_fallout = steam_df['game-title'] == 'Fallout 4'
is_play = steam_df['behavior-name'] == 'play'
condition = is_fallout & is_play

# .loc takes the row mask first and the list of columns second.
wanted_columns = ['user-id', 'game-title', 'value']
chosen_df = steam_df.loc[condition, wanted_columns]

display(HTML(chosen_df.head(10).to_html()))

## Simple operations on columns

### Multiply a column by 2

In [None]:
# Work on a copy so that the loaded steam_df stays untouched.
steam_df_copy = steam_df.copy()
display(HTML(steam_df_copy.head(10).to_html()))

# Arithmetic on a column applies to every row at once.
doubled = steam_df_copy['value'] * 2
steam_df_copy.loc[:, 'value'] = doubled
display(HTML(steam_df_copy.head(10).to_html()))

### Choose the first n letters of a string

In [None]:
# Work on a copy so that the loaded ml_movies_df stays untouched.
ml_movies_df_copy = ml_movies_df.copy()
display(HTML(ml_movies_df_copy.head(10).to_html()))

# The .str accessor applies string slicing to every title in the column.
short_titles = ml_movies_df_copy['title'].str[:6]
ml_movies_df_copy.loc[:, 'title'] = short_titles
display(HTML(ml_movies_df_copy.head(10).to_html()))

### Take the mean of a column

In [None]:
# Option 1
print(steam_df['value'].mean())

# Option 2
print(np.mean(steam_df['value']))

### Simple operation on filtered data

In [None]:
# Keep only 'play' rows of the two chosen games; .copy() detaches the result
# from steam_df so the assignment below cannot affect the original.
title = steam_df['game-title']
wanted_title = (title == 'Fallout 4') | (title == 'The Elder Scrolls V Skyrim')
was_played = steam_df['behavior-name'] == 'play'
steam_df_copy = steam_df.loc[wanted_title & was_played].copy()

display(HTML(steam_df_copy.head(10).to_html()))

# Mask of the rows to modify, computed on the copy so that it lines up with
# the copy's rows.
condition = (steam_df_copy['behavior-name'] == 'play') & (steam_df_copy['game-title'] == 'Fallout 4')

# Double 'value' on the masked rows only; all other rows stay as they were.
doubled = steam_df_copy.loc[condition, 'value'] * 2
steam_df_copy.loc[condition, 'value'] = doubled

display(HTML(steam_df_copy.head(10).to_html()))

## Advanced operations on columns

In [None]:
def reduce_outliers(x, cap=4):
    """Dampen large values: log-compress ``x`` and clip the result at ``cap``.

    Computes ``min(log(1 + x), cap)``. The function works on one scalar at a
    time (it uses the built-in ``min``), which is how ``Series.apply`` calls it.

    Args:
        x: A single number.
        cap: Upper bound for the returned value. Defaults to 4, the value
            that used to be hard-coded here.

    Returns:
        ``np.log(1 + x)`` when that is smaller than ``cap``, otherwise ``cap``.
    """
    return min(np.log(1 + x), cap)

# Restrict to 'play' rows and detach them from steam_df with .copy().
is_play = steam_df['behavior-name'] == 'play'
steam_df_copy = steam_df.loc[is_play].copy()

display(HTML(steam_df_copy.head(10).to_html()))

# apply() calls reduce_outliers once per value of the column.
compressed = steam_df_copy['value'].apply(reduce_outliers)
steam_df_copy.loc[:, 'value'] = compressed

display(HTML(steam_df_copy.head(10).to_html()))

### The same apply operation can be achieved with the use of a lambda function

In [None]:
# Restrict to 'play' rows and detach them from steam_df with .copy().
is_play = steam_df['behavior-name'] == 'play'
steam_df_copy = steam_df.loc[is_play].copy()

display(HTML(steam_df_copy.head(10).to_html()))

# Same transformation as before, written inline as an anonymous function:
# log(1 + x), clipped at 4.
compressed = steam_df_copy['value'].apply(lambda x: min(np.log(1 + x), 4))
steam_df_copy.loc[:, 'value'] = compressed

display(HTML(steam_df_copy.head(10).to_html()))

### Apply on two columns at once

In [None]:
# Restrict to 'play' rows and detach them from steam_df with .copy().
steam_df_copy = steam_df.loc[steam_df['behavior-name'] == 'play'].copy()

display(HTML(steam_df_copy.head(10).to_html()))

# Helper column: log(1 + value), clipped at 4.
steam_df_copy.loc[:, 'value_2'] = steam_df_copy['value'].apply(lambda x: min(np.log(1 + x), 4))

display(HTML(steam_df_copy.head(10).to_html()))

# With axis=1 the lambda receives one ROW at a time as a Series indexed by
# column name. Access the fields by label: integer keys such as x[0] rely on
# a positional fallback that pandas has deprecated for label-indexed Series.
steam_df_copy.loc[:, 'value'] = steam_df_copy[['value', 'value_2']].apply(
    lambda x: x['value'] * x['value_2'], axis=1
)

display(HTML(steam_df_copy.head(10).to_html()))

In [None]:
# Work on a copy so that the loaded ml_movies_df stays untouched.
ml_movies_df_copy = ml_movies_df.copy()

display(HTML(ml_movies_df_copy.head(10).to_html()))

# With axis=1 the lambda receives one ROW at a time as a Series indexed by
# column name. Access the fields by label: integer keys such as x[0] rely on
# a positional fallback that pandas has deprecated for label-indexed Series.
ml_movies_df_copy.loc[:, 'title|genres'] = ml_movies_df_copy[['title', 'genres']].apply(
    lambda x: x['title'] + "|" + x['genres'], axis=1
)

display(HTML(ml_movies_df_copy.head(10).to_html()))

## Grouping and aggregating

### Find the most popular games (in terms of purchases)

In [None]:
# Purchase rows only, reduced to the two columns needed for the ranking.
purchases = steam_df.loc[steam_df['behavior-name'] == 'purchase', ['game-title', 'value']]

# Sum 'value' per game, order from the largest total down, and turn the
# game title back into a regular column.
steam_grouped = (
    purchases
    .groupby('game-title')
    .sum()
    .sort_values(by='value', ascending=False)
    .reset_index()
)

display(HTML(steam_grouped.head(10).to_html()))

## Iterating over a DataFrame (if possible, use column operations instead)

In [None]:
# iterrows() yields (index label, row as a Series) pairs; stop after ten
# rows have been printed.
i = 0
for idx, row in steam_df.iterrows():
    fields = (row['user-id'], row['game-title'], row['behavior-name'])
    print("[{}, {}, {}]".format(*fields))
    i += 1
    if i == 10:
        break

## Pandas tasks - Steam dataset

**Task 4.** How many people made a purchase in the Steam dataset? Remember that a person could buy many games, but you need to count every person only once.

In [None]:
# Write your code here

**Task 5.** How many people made a purchase of "The Elder Scrolls V Skyrim"?

In [None]:
# Write your code here

**Task 6.** How many purchases did people make on average?

In [None]:
# Write your code here

**Task 7.** Who bought the most games?

In [None]:
# Write your code here

**Task 8.** How many hours on average did people play "The Elder Scrolls V Skyrim"?

In [None]:
# Write your code here

**Task 9.** Which games were played the most (in terms of the number of hours played)? Print the first 10 titles and respective numbers of hours.

In [None]:
# Write your code here

**Task 10.** Which games are the most consistently played (in terms of the average number of hours played)? Print the first 10 titles and respective numbers of hours.

In [None]:
# Write your code here

**Task 11\*\*.** Fix the above for the fact that 0 hours played is not listed, but only a purchase is recorded in such a case.

In [None]:
# Write your code here

**Task 12.** Apply the sigmoid function
$$f(x) = \frac{1}{1 + e^{-\frac{1}{100}x}}$$
to hours played and print the first 10 rows from the entire Steam dataset after this change.

In [None]:
# Write your code here

## Pandas tasks - MovieLens dataset

**Task 13\*.** Calculate popularity (by the number of users who watched a movie) of all genres.

In [None]:
# Write your code here

**Task 14\*.** Calculate average rating for all genres.

In [None]:
# Write your code here

**Task 15.** Calculate each movie's rating bias (the deviation of its average rating from the mean of all movies' average ratings). Print the first 10 in the form: title, average rating, bias.

In [None]:
# Write your code here

**Task 16.** Calculate each user's rating bias (the deviation of their average rating from the mean of all users' average ratings). Print the first 10 in the form: user_id, average rating, bias.

In [None]:
# Write your code here

**Task 17.** Randomly choose 10 movies and 10 users and print their interaction matrix in the form of a DataFrame with user_id as index and movie titles as columns (use HTML Display for that). You can iterate over the DataFrame in this task.

In [None]:
# Write your code here

## Pandas + numpy tasks

**Task 18.** Create the entire interaction matrix for the MovieLens dataset.

In [None]:
# Write your code here

**Task 19.** Calculate the matrix of size (n_users, n_users) where at position (i, j) is the number of movies watched both by user i and user j. Print the submatrix of first 10 rows and 10 columns.

In [None]:
# Write your code here

**Task 20.** Calculate the matrix of size (n_items, n_items) where at position (i, j) is the number of users who watched both movie i and movie j. Print the submatrix of first 10 rows and 10 columns.

In [None]:
# Write your code here