In [2]:
# Load the raw music data set from the CSV next to this notebook
import pandas as pd

df = pd.read_csv(filepath_or_buffer='./music_data.csv')
# number of rows read
df.shape[0]

1000

In [3]:
# Discard every row that contains at least one missing value
df = df.dropna(axis=0, how='any')
# number of rows that remain
df.shape[0]

553

In [4]:
# Remove the columns that are not used in the analysis
df = df.drop(columns=['Unnamed: 11', 'Featured Streams', '#', 'Last Update'])

# Inspect the remaining columns and their dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 553 entries, 0 to 998
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Performer     553 non-null    object
 1   Lead_Streams  553 non-null    object
 2   Tracks        553 non-null    object
 3   1b+           553 non-null    object
 4   100m+         553 non-null    int64 
 5   10m+          553 non-null    int64 
 6   1m+           553 non-null    int64 
 7   min           553 non-null    object
 8   debut         553 non-null    object
 9   artist_type   553 non-null    object
dtypes: int64(3), object(7)
memory usage: 47.5+ KB


In [5]:
# Strip thousands-separator commas from every string cell so the
# columns can be converted to numeric afterwards
df = df.replace(to_replace=',', value='', regex=True)


In [6]:
# Some rows carry a slash-formatted date in the 'debut' column by mistake;
# flag them and keep only the rows without a slash
debut_has_slash = df['debut'].str.contains("/")
df = df[~debut_has_slash]

In [7]:
# Parse the columns that were read as strings into numeric dtypes
for numeric_col in ['Lead_Streams', '1b+', 'debut', 'Tracks']:
    df[numeric_col] = pd.to_numeric(df[numeric_col])

df

Unnamed: 0,Performer,Lead_Streams,Tracks,1b+,100m+,10m+,1m+,min,debut,artist_type
0,Drake,42937631439,249,4,112,241,244,5/23/09,4569,solo
1,Ed Sheeran,32149951918,207,7,56,155,182,7/28/12,3407,solo
2,Bad Bunny,29409721272,141,2,81,134,140,1/12/19,1048,solo
3,Ariana Grande,28008187086,179,4,63,123,172,8/10/13,3029,solo
4,The Weeknd,27049238010,148,6,61,125,140,11/17/12,3295,solo
...,...,...,...,...,...,...,...,...,...,...
990,No Doubt,1127656163,120,0,2,13,53,12/16/95,9476,group
991,M.I.A.,1127631117,111,0,3,11,75,8/2/08,4863,solo
993,Deep Purple,1124412499,814,0,2,14,72,8/17/68,19458,group
997,Fat Joe,1121148790,224,0,3,18,60,10/9/93,10274,solo


In [9]:
# find means and sd's
import numpy as np

print('Lead Streams mean: ' + str(df.Lead_Streams.mean()))
print('Lead Streams sd: ' + str(df.Lead_Streams.std()))

print('Experience mean (days): ' + str(df.debut.mean()))
print('Experience sd (days): ' + str(df.debut.std()))

# `np.float` was only an alias for the builtin `float`; it was deprecated in
# NumPy 1.20 and removed in NumPy 1.24, so the builtin is used directly.
# Note: ndarray.std() defaults to ddof=0 (population sd), whereas the pandas
# Series.std() calls in this cell default to ddof=1 (sample sd).
tracks_values = np.asarray(df.Tracks, dtype=float)
print('Tracks mean: ' + str(tracks_values.mean()))
print('Tracks sd: ' + str(tracks_values.std()))

# Left disabled: 'tracks_per_year' is only created in a later cell.
#print('Tracks per year mean: ' + str(np.asarray(df.tracks_per_year, dtype=float).mean()))
#print('Tracks per year sd: ' + str(np.asarray(df.tracks_per_year, dtype=float).std()))

# The stream-count bucket columns all get the same mean/sd report.
for bucket_col in ['1b+', '100m+', '10m+', '1m+']:
    print(bucket_col + ' mean: ' + str(df[bucket_col].mean()))
    print(bucket_col + ' sd: ' + str(df[bucket_col].std()))

Lead Streams mean: 4513825140.451554
Lead Streams sd: 4950706901.209861
Experience mean (days): 6644.7239488117
Experience sd (days): 5829.866961285955
Tracks mean: 247.07861060329068
Tracks sd: 308.4643915275842
1b+ mean: 0.3491773308957952
1b+ sd: 0.970543378825046
100m+ mean: 9.784277879341865
100m+ sd: 11.924253057120985
10m+ mean: 46.725776965265084
10m+ sd: 35.21292809504895
1m+ mean: 116.61791590493601
1m+ sd: 69.26943657391507


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  print('Tracks mean: ' + str(np.asarray(df.Tracks, dtype=np.float).mean()))
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  print('Tracks sd: ' + str(np.asarray(df.Tracks, dtype=np.float).std()))


In [10]:
# How many rows fall under each artist_type value
df['artist_type'].value_counts()

solo     371
group    176
Name: artist_type, dtype: int64

In [11]:
# 'debut' holds days of experience: convert it to years (365 days each)
# and divide the track count by it
df['tracks_per_year'] = df['Tracks'].div(df['debut'].div(365))

df

Unnamed: 0,Performer,Lead_Streams,Tracks,1b+,100m+,10m+,1m+,min,debut,artist_type,tracks_per_year
0,Drake,42937631439,249,4,112,241,244,5/23/09,4569,solo,19.891661
1,Ed Sheeran,32149951918,207,7,56,155,182,7/28/12,3407,solo,22.176402
2,Bad Bunny,29409721272,141,2,81,134,140,1/12/19,1048,solo,49.107824
3,Ariana Grande,28008187086,179,4,63,123,172,8/10/13,3029,solo,21.569825
4,The Weeknd,27049238010,148,6,61,125,140,11/17/12,3295,solo,16.394537
...,...,...,...,...,...,...,...,...,...,...,...
990,No Doubt,1127656163,120,0,2,13,53,12/16/95,9476,group,4.622203
991,M.I.A.,1127631117,111,0,3,11,75,8/2/08,4863,solo,8.331277
993,Deep Purple,1124412499,814,0,2,14,72,8/17/68,19458,group,15.269298
997,Fat Joe,1121148790,224,0,3,18,60,10/9/93,10274,solo,7.957952


In [14]:
# Express experience in years instead of days (365 days per year).
# Re-running this cell divides the column again.
df['debut'] = df['debut'].div(365)

df

Unnamed: 0,Performer,Lead_Streams,Tracks,1b+,100m+,10m+,1m+,min,debut,artist_type,tracks_per_year
0,Drake,42937631439,249,4,112,241,244,5/23/09,12.517808,solo,19.891661
1,Ed Sheeran,32149951918,207,7,56,155,182,7/28/12,9.334247,solo,22.176402
2,Bad Bunny,29409721272,141,2,81,134,140,1/12/19,2.871233,solo,49.107824
3,Ariana Grande,28008187086,179,4,63,123,172,8/10/13,8.298630,solo,21.569825
4,The Weeknd,27049238010,148,6,61,125,140,11/17/12,9.027397,solo,16.394537
...,...,...,...,...,...,...,...,...,...,...,...
990,No Doubt,1127656163,120,0,2,13,53,12/16/95,25.961644,group,4.622203
991,M.I.A.,1127631117,111,0,3,11,75,8/2/08,13.323288,solo,8.331277
993,Deep Purple,1124412499,814,0,2,14,72,8/17/68,53.309589,group,15.269298
997,Fat Joe,1121148790,224,0,3,18,60,10/9/93,28.147945,solo,7.957952


In [15]:
# Give the columns names that match their content: the numeric 'debut'
# column becomes 'years_experience', and the 'min' column becomes 'debut'
column_renames = {'debut': 'years_experience', 'min': 'debut'}
df = df.rename(column_renames, axis='columns')

df

Unnamed: 0,Performer,Lead_Streams,Tracks,1b+,100m+,10m+,1m+,debut,years_experience,artist_type,tracks_per_year
0,Drake,42937631439,249,4,112,241,244,5/23/09,12.517808,solo,19.891661
1,Ed Sheeran,32149951918,207,7,56,155,182,7/28/12,9.334247,solo,22.176402
2,Bad Bunny,29409721272,141,2,81,134,140,1/12/19,2.871233,solo,49.107824
3,Ariana Grande,28008187086,179,4,63,123,172,8/10/13,8.298630,solo,21.569825
4,The Weeknd,27049238010,148,6,61,125,140,11/17/12,9.027397,solo,16.394537
...,...,...,...,...,...,...,...,...,...,...,...
990,No Doubt,1127656163,120,0,2,13,53,12/16/95,25.961644,group,4.622203
991,M.I.A.,1127631117,111,0,3,11,75,8/2/08,13.323288,solo,8.331277
993,Deep Purple,1124412499,814,0,2,14,72,8/17/68,53.309589,group,15.269298
997,Fat Joe,1121148790,224,0,3,18,60,10/9/93,28.147945,solo,7.957952


In [41]:
# building a linear regression model
from sklearn.linear_model import LinearRegression

# predictors: every column except the performer name, the debut column
# and the target itself
x = df.drop(columns=['Performer', 'debut', 'Lead_Streams'])
# target: lead streams
y = df['Lead_Streams']

In [42]:
# display the feature matrix (df without 'Performer', 'debut' and 'Lead_Streams')
x

Unnamed: 0,Tracks,1b+,100m+,10m+,1m+,years_experience,artist_type,tracks_per_year
0,249,4,112,241,244,12.517808,solo,19.891661
1,207,7,56,155,182,9.334247,solo,22.176402
2,141,2,81,134,140,2.871233,solo,49.107824
3,179,4,63,123,172,8.298630,solo,21.569825
4,148,6,61,125,140,9.027397,solo,16.394537
...,...,...,...,...,...,...,...,...
990,120,0,2,13,53,25.961644,group,4.622203
991,111,0,3,11,75,13.323288,solo,8.331277
993,814,0,2,14,72,53.309589,group,15.269298
997,224,0,3,18,60,28.147945,solo,7.957952


In [43]:
# One-hot encode the categorical 'artist_type' column; the numeric
# columns pass through unchanged
x = pd.get_dummies(x, columns=['artist_type'])

x

Unnamed: 0,Tracks,1b+,100m+,10m+,1m+,years_experience,tracks_per_year,artist_type_group,artist_type_solo
0,249,4,112,241,244,12.517808,19.891661,0,1
1,207,7,56,155,182,9.334247,22.176402,0,1
2,141,2,81,134,140,2.871233,49.107824,0,1
3,179,4,63,123,172,8.298630,21.569825,0,1
4,148,6,61,125,140,9.027397,16.394537,0,1
...,...,...,...,...,...,...,...,...,...
990,120,0,2,13,53,25.961644,4.622203,1,0
991,111,0,3,11,75,13.323288,8.331277,0,1
993,814,0,2,14,72,53.309589,15.269298,1,0
997,224,0,3,18,60,28.147945,7.957952,0,1


In [51]:
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split, cross_val_score

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=801
)

# Fit on the training rows, then score the held-out predictions
lm = LinearRegression().fit(x_train, y_train)
yhat_test = lm.predict(x_test)
mse_lm_tt = mean_squared_error(y_test, yhat_test)
print(r2_score(y_test, yhat_test))

0.9768353693163804


In [50]:
# Fit an ordinary least squares model on all rows and print the
# statsmodels summary to see which predictors are significant
import statsmodels.api as sm

est = sm.OLS(endog=y, exog=x)
est2 = est.fit()
print(est2.summary())

                            OLS Regression Results                            
Dep. Variable:           Lead_Streams   R-squared:                       0.978
Model:                            OLS   Adj. R-squared:                  0.978
Method:                 Least Squares   F-statistic:                     2993.
Date:                Mon, 06 Dec 2021   Prob (F-statistic):               0.00
Time:                        16:35:32   Log-Likelihood:                -11942.
No. Observations:                 547   AIC:                         2.390e+04
Df Residuals:                     538   BIC:                         2.394e+04
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                        coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------
Tracks             1.133e+04    1.6e+0

In [57]:
# column-wise mean of every feature
x.mean(axis=0)

Tracks               247.078611
1b+                    0.349177
100m+                  9.784278
10m+                  46.725777
1m+                  116.617916
years_experience      18.204723
tracks_per_year       19.107390
artist_type_group      0.321755
artist_type_solo       0.678245
dtype: float64

In [59]:
# make predictions

# Let's make a prediction based on the means for these top 1000 artists
# (547 after removing missing values).
# NOTE: the values below are hand-rounded; the '1b+' entry (0.5) and the two
# artist_type dummies (.5/.5) are not the exact column means shown by x.mean().

# define one new data instance; a one-row DataFrame carrying the training
# column names (same order as x) avoids scikit-learn's warning about
# predicting without feature names on a model fitted with them
Xnew = pd.DataFrame([[247, 0.5, 10, 47, 116, 18, 19, .5, .5]], columns=x.columns)
# make a prediction
ynew = lm.predict(Xnew)

# predict() returns a length-1 array; take the element explicitly because
# int() on an array with ndim > 0 is deprecated since NumPy 1.25
int(ynew[0])

4783799506

## Predicted lead streams for the "average" artist: 4,783,799,506 (about 4.78 billion)