In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import Markdown, display, HTML

# Fix the dying kernel problem (only a problem in some installations - you can remove it, if it works without it)
import os
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'

# Numpy tasks

For a detailed reference check out: https://numpy.org/doc/stable/reference/arrays.indexing.html.

**Task 1.** Calculate the sigmoid (logistic) function on every element of the following numpy array [0.3, 1.2, -1.4, 0.2, -0.1, 0.1, 0.8, -0.25] and print the last 5 elements. Use only vector operations.

In [2]:
# Write your code here
array = np.array([0.3, 1.2, -1.4, 0.2, -0.1, 0.1, 0.8, -0.25])
print(array)
array = 1/(1 + np.exp(-array))
print(array)
print(np.exp(0.3))
print(1/(1 + np.exp(-0.3)))
temp = [0.3, 1.2]
temp2 = np.array(temp)
print(1/(1 + np.exp(-temp2)))

[ 0.3   1.2  -1.4   0.2  -0.1   0.1   0.8  -0.25]
[0.57444252 0.76852478 0.19781611 0.549834   0.47502081 0.52497919
 0.68997448 0.4378235 ]
1.3498588075760032
0.574442516811659
[0.57444252 0.76852478]


**Task 2.** Calculate the dot product of the following two vectors:<br/>
$x = [3, 1, 4, 2, 6, 1, 4, 8]$<br/>
$y = [5, 2, 3, 12, 2, 4, 17, 11]$<br/>
a) by using element-wise mutliplication and np.sum,<br/>
b) by using np.dot,<br/>
b) by using np.matmul and transposition (x.T).

In [3]:
# Write your code here
x = np.array([3,1,4,2,6,1,4,8])
y = np.array([5,2,3,12,2,4,17,11])
print(x * y)
# a
print(sum(x*y))
# b
print(np.dot(x, y))
# c
print(np.matmul(x, y.T))
print(np.matmul(x.T, y))
print(np.matmul(y,x))

[15  2 12 24 12  4 68 88]
225
225
225
225
225


**Task 3.** Calculate value of the logistic model<br/>
$$y = \frac{1}{1 + e^{-x_0 \theta_0 - \ldots - x_9 \theta_9 - \theta_{10}}}$$
for<br/>
$x = [1.2, 2.3, 3.4, -0.7, 4.2, 2.7, -0.5, 1.4, -3.3, 0.2]$<br/>
$\theta = [2.7, 0.33, -2.12, -1.73, 2.9, -5.8, -0.9, 12.11, 3.43, -0.5, -1.65]$<br/>
and print the result. Use only vector operations.

In [4]:
# Write your code here
x = np.array([1.2,2.3,3.4, -0.7,4.2,2.7, -0.5,1.4, -3.3,0.2, 1])
print(x)
theta = np.array([2.7,0.33, -2.12, -1.73,2.9, -5.8, -0.9,12.11,3.43, -0.5, -1.65])
print(theta)
power = -(x*theta)
print(power)
powerSum = sum(power)
print(powerSum)
y = 1/(1 + np.exp(powerSum))
print(y)

[ 1.2  2.3  3.4 -0.7  4.2  2.7 -0.5  1.4 -3.3  0.2  1. ]
[ 2.7   0.33 -2.12 -1.73  2.9  -5.8  -0.9  12.11  3.43 -0.5  -1.65]
[ -3.24   -0.759   7.208  -1.211 -12.18   15.66   -0.45  -16.954  11.319
   0.1     1.65 ]
1.1430000000000042
0.24176998326155683


**Task 4.** Calculate value of the multivariate linear regression model<br/>
$$y = A x + B$$
for<br/>
$A = \begin{bmatrix} 1 & 2 & 1 \\ 3 & 0 & 1 \end{bmatrix}$<br/>
$B = \begin{bmatrix} 0.2 \\ 0.3 \end{bmatrix}$<br/>
$x = [1, 2, 3]^T$<br/>
and print the result. Use only vector and matrix operations.

In [5]:
# Write your code here
a = np.array([[1, 2, 1], [3, 0, 1]])
print(a)
b = np.array([[0.2], [0.3]])
print(b)
x = np.array([1, 2, 3])
print(x)
temp = np.matmul(a, x.T)
print(temp)
y = temp + b
print(y)
yw = np.matmul(a, x.T) + b
print(yw)

[[1 2 1]
 [3 0 1]]
[[0.2]
 [0.3]]
[1 2 3]
[8 6]
[[8.2 6.2]
 [8.3 6.3]]
[[8.2 6.2]
 [8.3 6.3]]


# Pandas

## Load datasets

- Steam (https://www.kaggle.com/tamber/steam-video-games)

- MovieLens (https://grouplens.org/datasets/movielens/)

In [6]:
steam_df = pd.read_csv(os.path.join("data", "steam", "steam-200k.csv"), 
                       names=['user-id', 'game-title', 'behavior-name', 'value', 'zero'])

ml_ratings_df = pd.read_csv(os.path.join("data", "movielens_small", "ratings.csv"))
ml_movies_df = pd.read_csv(os.path.join("data", "movielens_small", "movies.csv"))

## Merge both MovieLens DataFrames into one

In [7]:
ml_df = pd.merge(ml_ratings_df, ml_movies_df, on='movieId')
ml_df.head(10)

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,5,1,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7,1,4.5,1106635946,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,15,1,2.5,1510577970,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,17,1,4.5,1305696483,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
5,18,1,3.5,1455209816,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
6,19,1,4.0,965705637,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
7,21,1,3.5,1407618878,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
8,27,1,3.0,962685262,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
9,31,1,5.0,850466616,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


## Pandas tasks - Steam dataset

**Task 5.** How many people made a purchase in the Steam dataset? Remember that a person could buy many games, but you need to count every person once.

In [8]:
# Write your code here
print(len(steam_df))
print(steam_df.loc('bevior-name' == 'purchase'))
chosen_df = steam_df['behavior-name']
display(chosen_df.head(10))
print(len(chosen_df))
#ch = chosen_df.loc(chosen_df['behavior-name'] == 'purchase')
#display(ch.head(10))
condition = steam_df['behavior-name'] == 'purchase'

print(condition.head(10))
chosen = steam_df.loc[condition]

#display(chosen.head(10))
ch = steam_df['user-id']
ch = ch.drop_duplicates()
display(ch.head(10))
print(len(ch))

200000
<pandas.core.indexing._LocIndexer object at 0x0000025744AC69F0>


0    purchase
1        play
2    purchase
3        play
4    purchase
5        play
6    purchase
7        play
8    purchase
9        play
Name: behavior-name, dtype: object

200000
0     True
1    False
2     True
3    False
4     True
5    False
6     True
7    False
8     True
9    False
Name: behavior-name, dtype: bool


0      151603712
66     187131847
68      59945701
133     53875128
835    234941318
836    140954425
838     26122540
855    176410694
857    197278511
859    150128162
Name: user-id, dtype: int64

12393


**Task 6.** How many people made a purchase of "The Elder Scrolls V Skyrim"?

In [9]:
# Write your code here
chooseColumns = steam_df[['behavior-name', 'game-title']]
condition = chooseColumns['behavior-name'] == 'purchase'
filteredRowsPurchase = chooseColumns.loc[condition]
display(filteredRowsPurchase.head(10))
conditionSkyrim = filteredRowsPurchase['game-title'] == 'The Elder Scrolls V Skyrim'
filteredRowsSkyrim = filteredRowsPurchase.loc[conditionSkyrim]
display(filteredRowsSkyrim.head(10))
print(len(filteredRowsSkyrim))

Unnamed: 0,behavior-name,game-title
0,purchase,The Elder Scrolls V Skyrim
2,purchase,Fallout 4
4,purchase,Spore
6,purchase,Fallout New Vegas
8,purchase,Left 4 Dead 2
10,purchase,HuniePop
12,purchase,Path of Exile
14,purchase,Poly Bridge
16,purchase,Left 4 Dead
18,purchase,Team Fortress 2


Unnamed: 0,behavior-name,game-title
0,purchase,The Elder Scrolls V Skyrim
72,purchase,The Elder Scrolls V Skyrim
793,purchase,The Elder Scrolls V Skyrim
1065,purchase,The Elder Scrolls V Skyrim
1167,purchase,The Elder Scrolls V Skyrim
1387,purchase,The Elder Scrolls V Skyrim
2064,purchase,The Elder Scrolls V Skyrim
2568,purchase,The Elder Scrolls V Skyrim
3232,purchase,The Elder Scrolls V Skyrim
3362,purchase,The Elder Scrolls V Skyrim


717


**Task 7.** How many purchases people made on average?

In [10]:
# Write your code here
ch = steam_df['user-id']
ch = ch.drop_duplicates()
numberOfPeople = len(ch)
print(numberOfPeople)

chooseColumns = steam_df[['behavior-name', 'game-title']]
condition = chooseColumns['behavior-name'] == 'purchase'
filteredRowsPurchase = chooseColumns.loc[condition]
#display(filteredRowsPurchase.head(10))
purchases = len(filteredRowsPurchase)
print(purchases)

print(purchases/numberOfPeople)

12393
129511
10.45033486645687


**Task 8.** Who bought the most games?

In [11]:
# Write your code here
playersWithBehavior = steam_df[['user-id', 'behavior-name']]
condition = playersWithBehavior['behavior-name'] == 'purchase'
idAndPurchase = playersWithBehavior.loc[condition]
display(idAndPurchase.head(10))
grouped = idAndPurchase.groupby('user-id').agg({'behavior-name' : 'count'}).sort_values('behavior-name', ascending = False)
display(grouped.head(10))
print(np.max(grouped['behavior-name']))

Unnamed: 0,user-id,behavior-name
0,151603712,purchase
2,151603712,purchase
4,151603712,purchase
6,151603712,purchase
8,151603712,purchase
10,151603712,purchase
12,151603712,purchase
14,151603712,purchase
16,151603712,purchase
18,151603712,purchase


Unnamed: 0_level_0,behavior-name
user-id,Unnamed: 1_level_1
62990992,1075
33865373,783
30246419,766
58345543,667
76892907,597
20772968,595
11403772,592
64787956,591
22301321,568
47457723,557


1075


**Task 9.** How many hours on average people played in "The Elder Scrolls V Skyrim"?

In [12]:
# Write your code here
chooseColumns = steam_df[['behavior-name', 'game-title', 'value']]
condition = chooseColumns['behavior-name'] == 'play'
filteredRowsPurchase = chooseColumns.loc[condition]
#display(filteredRowsPurchase.head(10))
conditionSkyrim = filteredRowsPurchase['game-title'] == 'The Elder Scrolls V Skyrim'
filteredRowsSkyrim = filteredRowsPurchase.loc[conditionSkyrim]
display(filteredRowsSkyrim.head(10))
numberOfPlayers = len(filteredRowsSkyrim)
print(numberOfPlayers)
meac = filteredRowsSkyrim['value'].mean()
print(meac)

Unnamed: 0,behavior-name,game-title,value
1,play,The Elder Scrolls V Skyrim,273.0
73,play,The Elder Scrolls V Skyrim,58.0
1066,play,The Elder Scrolls V Skyrim,110.0
1168,play,The Elder Scrolls V Skyrim,465.0
1388,play,The Elder Scrolls V Skyrim,220.0
2065,play,The Elder Scrolls V Skyrim,35.0
2569,play,The Elder Scrolls V Skyrim,14.6
3233,play,The Elder Scrolls V Skyrim,320.0
3363,play,The Elder Scrolls V Skyrim,80.0
3512,play,The Elder Scrolls V Skyrim,73.0


677
104.71093057607091


**Task 10.** Which games were played the most (in terms of the number of hours played)? Print the first 10 titles and respective numbers of hours.

In [13]:
# Write your code here
chooseColumns = steam_df[['user-id', 'behavior-name', 'game-title', 'value']]
condition = chooseColumns['behavior-name'] == 'play'
filtered = chooseColumns.loc[condition]
#display(filtered.head(10))
grouped = filtered.groupby('game-title').agg({'value' : 'sum'}).sort_values('value', ascending = False)
display(grouped.head(10))

Unnamed: 0_level_0,value
game-title,Unnamed: 1_level_1
Dota 2,981684.6
Counter-Strike Global Offensive,322771.6
Team Fortress 2,173673.3
Counter-Strike,134261.1
Sid Meier's Civilization V,99821.3
Counter-Strike Source,96075.5
The Elder Scrolls V Skyrim,70889.3
Garry's Mod,49725.3
Call of Duty Modern Warfare 2 - Multiplayer,42009.9
Left 4 Dead 2,33596.7


**Task 11.** Which games are the most consistently played (in terms of the average number of hours played)? Print the first 10 titles and respective numbers of hours.

In [14]:
# Write your code here
chooseColumns = steam_df[['user-id', 'behavior-name', 'game-title', 'value']]
#display(steam_df.head(10))
condition = chooseColumns['behavior-name'] == 'play'
filtered = chooseColumns.loc[condition]
#display(filtered.head(10))
grouped = filtered.groupby('game-title').agg({'value' : 'mean'}).sort_values('value', ascending = False)
display(grouped.head(10))

Unnamed: 0_level_0,value
game-title,Unnamed: 1_level_1
Eastside Hockey Manager,1295.0
Baldur's Gate II Enhanced Edition,475.255556
FIFA Manager 09,411.0
Perpetuum,400.975
Football Manager 2014,391.984615
Football Manager 2012,390.453165
Football Manager 2010,375.048571
Football Manager 2011,365.703226
Freaking Meatbags,331.0
Out of the Park Baseball 16,330.4


**Task 12\*\*.** Fix the above for the fact that 0 hours played is not listed, but only a purchase is recorded in such a case.

In [None]:
# Write your code here

**Task 13.** Apply the sigmoid function
$$f(x) = \frac{1}{1 + e^{-\frac{1}{100}x}}$$
to hours played and print the first 10 rows from the entire Steam dataset after this change.

In [15]:
# Write your code here
def sigmoid(x):
    return 1/(1 + (np.exp(-(x/100))))
print(sigmoid(100))
steam_df_copy = steam_df
steam_df_copy['value'] = steam_df_copy['value'].apply(sigmoid)
display(steam_df_copy.head(10))
#condition = steam_df_copy['behavior-name'] == 'play'
#filtered = steam_df_copy.loc[condition]
#filtered['value'] = filtered['value'].apply(sigmoid)
#display(filtered.head(10))

0.7310585786300049


Unnamed: 0,user-id,game-title,behavior-name,value,zero
0,151603712,The Elder Scrolls V Skyrim,purchase,0.5025,0
1,151603712,The Elder Scrolls V Skyrim,play,0.938774,0
2,151603712,Fallout 4,purchase,0.5025,0
3,151603712,Fallout 4,play,0.704746,0
4,151603712,Spore,purchase,0.5025,0
5,151603712,Spore,play,0.537181,0
6,151603712,Fallout New Vegas,purchase,0.5025,0
7,151603712,Fallout New Vegas,play,0.530213,0
8,151603712,Left 4 Dead 2,purchase,0.5025,0
9,151603712,Left 4 Dead 2,play,0.522235,0


## Pandas tasks - MovieLens dataset

**Task 14\*.** Calculate popularity (by the number of users who watched a movie) of all genres. Print a DataFrame with two columns: genre, n_users, where n_users contains the number of users who watched a given genre. Sort all genres in descending order.

In [None]:
# Write your code here
# po samych gatunkach można tu iterować pomimo że nei można po samym data frame'ie

**Task 15\*.** Calculate average rating for all genres. Print a DataFrame with two columns: genre, rating, where rating contains the average rating for a given genre. Sort all genres in descending order.

In [None]:
# Write your code here

**Task 17.** Calculate each movie rating bias (deviation from the mean of all movies average rating). Print first 10 in the form: title, average rating, bias.

In [None]:
# Write your code here
movies = ml_df
movies = movies[['userId', 'movieId', 'rating']]
movies['mean'] = 0
#grouped = idAndPurchase.groupby('user-id').agg({'behavior-name' : 'count'}).sort_values('behavior-name', ascending = False)
#grouped = movies
temp = movies.groupby('movieId').agg({'rating': 'mean'})
movies['mean'] = movies['mean'] - temp
#display(grouped.head(10))
display(movies.head(10))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies['mean'] = 0


**Task 17.** Calculate each user rating bias (deviation from the mean of all users average rating). Print first 10 in the form: user_id, average rating, bias.

In [None]:
# Write your code here

**Task 18.** Randomly choose 10 movies and 10 users and print their interaction matrix in the form of a DataFrame with user_id as index and movie titles as columns. You can iterate over the DataFrame in this task.

In [None]:
# Write your code here

## Pandas + numpy tasks

**Task 19.** Create the entire interaction matrix for the MovieLens dataset. Print the submatrix of first 10 rows and 10 columns.

In [None]:
# Write your code here

**Task 20.** Calculate the matrix of size (n_users, n_users) where at position (i, j) there is the number of movies watched both by user i and user j. Print the submatrix of first 10 rows and 10 columns.

In [None]:
# Write your code here

**Task 21.** Calculate the matrix of size (n_items, n_items) where at position (i, j) there is the number of users who watched both movie i and movie j. To prevent hanging your computer because of RAM shortage use only the first 1000 items. Print the submatrix of first 10 rows and 10 columns.

In [None]:
# Write your code here