# Data Analysis with Pandas

**Video Game Sales** 
Renad Al-khlafat
25/10/2021


In [1]:
import pandas as pd

In [2]:
# Read the video game sales CSV (expected next to the notebook) into the
# DataFrame that every later cell queries, then display it.
df = pd.read_csv(filepath_or_buffer="vgsales.csv")
df

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
0,1,Wii Sports,Wii,2006.0,Sports,Nintendo,41.49,29.02,3.77,8.46,82.74
1,2,Super Mario Bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24
2,3,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.85,12.88,3.79,3.31,35.82
3,4,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.75,11.01,3.28,2.96,33.00
4,5,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,11.27,8.89,10.22,1.00,31.37
...,...,...,...,...,...,...,...,...,...,...,...
16593,16596,Woody Woodpecker in Crazy Castle 5,GBA,2002.0,Platform,Kemco,0.01,0.00,0.00,0.00,0.01
16594,16597,Men in Black II: Alien Escape,GC,2003.0,Shooter,Infogrames,0.01,0.00,0.00,0.00,0.01
16595,16598,SCORE International Baja 1000: The Official Game,PS2,2008.0,Racing,Activision,0.00,0.00,0.00,0.00,0.01
16596,16599,Know How 2,DS,2010.0,Puzzle,7G//AMES,0.00,0.01,0.00,0.00,0.01


## Which company is the most common video game publisher?

In [3]:
# Count how many rows each publisher has, then keep the label with the
# highest count.
most_common_publisher1 = df['Publisher']
publisher_counts = most_common_publisher1.value_counts()
most_common_publisher = publisher_counts.idxmax()

## What’s the most common platform?

In [4]:
# Count how many rows each platform has, then keep the label with the
# highest count.
most_common_platform1 = df['Platform']
platform_counts = most_common_platform1.value_counts()
most_common_platform = platform_counts.idxmax()

## What about the most common genre?

In [5]:
# Count how many rows each genre has, then keep the label with the
# highest count.
most_common_genre1 = df['Genre']
genre_counts = most_common_genre1.value_counts()
most_common_genre = genre_counts.idxmax()

## What are the top 20 highest grossing games?

In [6]:
# Index the rows by Global_Sales, order that index from highest to lowest,
# and keep the names of the first twenty rows. The result is a Series of
# game names whose index holds the corresponding Global_Sales values.
top_twenty_highest_grossing_games1 = df.set_index('Global_Sales')
names_by_global_sales = top_twenty_highest_grossing_games1.sort_index(ascending=False)["Name"]
top_twenty_highest_grossing_games = names_by_global_sales.head(20)
top_twenty_highest_grossing_games

Global_Sales
82.74                                      Wii Sports
40.24                               Super Mario Bros.
35.82                                  Mario Kart Wii
33.00                               Wii Sports Resort
31.37                        Pokemon Red/Pokemon Blue
30.26                                          Tetris
30.01                           New Super Mario Bros.
29.02                                        Wii Play
28.62                       New Super Mario Bros. Wii
28.31                                       Duck Hunt
24.76                                      Nintendogs
23.42                                   Mario Kart DS
23.10                     Pokemon Gold/Pokemon Silver
22.72                                         Wii Fit
22.00                                    Wii Fit Plus
21.82                              Kinect Adventures!
21.40                              Grand Theft Auto V
20.81                   Grand Theft Auto: San Andreas
20.61    Brain Age: Train Your Brain in Minutes a Day
Name: Name, dtype: object

## For North American video game sales, what’s the median?
- Provide a secondary output showing ten games surrounding the median sales output
    - Assume that games with the same median value are sorted in descending order

In [7]:
# Median of the North American sales column.
na_median_sales = df["NA_Sales"].median()

# Rows whose NA_Sales is exactly equal to that median, in the frame's
# existing row order; the secondary output is the first ten of their names.
sells_at_median = df["NA_Sales"].eq(na_median_sales)
around_median = df.loc[sells_at_median]
ten_median_na_seller_names = around_median["Name"].head(10)
ten_median_na_seller_names

446                                     Dragon Warrior IV
497           World Soccer Winning Eleven 7 International
1617                               Farming Simulator 2015
1926                            Pro Evolution Soccer 2008
2067    Winning Eleven: Pro Evolution Soccer 2007 (All...
2373                             Phantasy Star Portable 2
2579                                 The Sims 2: Castaway
3186                                       SingStar Queen
3503                                           Top Spin 3
3703                 Sonic & All-Stars Racing Transformed
Name: Name, dtype: object

## For the top-selling game of all time, how many standard deviations above/below the mean are its sales for North America?

In [8]:
# How many standard deviations the top-selling game's North American sales
# lie above (positive) or below (negative) the mean of NA_Sales.
mean = df["NA_Sales"].mean()

# Find the top seller by its Global_Sales value instead of assuming it is the
# first row, so the answer does not depend on how the CSV happens to be sorted.
top_selling = df.loc[df["Global_Sales"].idxmax(), "NA_Sales"]

selling_std = df["NA_Sales"].std()
na_std = (top_selling - mean) / selling_std
na_std

50.47898767479108

## The Nintendo Wii seems to have outdone itself with games. How does its average number of sales compare with all of the other platforms?

In [9]:
# Mean Global_Sales per game, split by whether the row's Platform is "Wii".
# The grouped result is indexed by that boolean: False -> every other
# platform, True -> Wii.
nintendo_wii = df.groupby(df["Platform"] == "Wii")["Global_Sales"].mean()

# Look the two groups up by their boolean labels rather than by the integers
# 0 and 1, which only work through positional fallback / 0 == False hashing.
# Tuple order is unchanged: (other platforms, Wii).
num_of_sales = (nintendo_wii.loc[False], nintendo_wii.loc[True])
num_of_sales

(0.5233896418516336, 0.6994037735849057)

## Come up with 3 more questions that can be answered with this data set.

### What’s the release year of the newest game?

In [10]:
# The newest release year is the largest value in the Year column
# (Series.max skips missing values by default).
# Note: taking idxmin() of the per-year counts would return the year with the
# fewest games, which is not the same thing as the latest year.
newest_one = df['Year']
newest_game = newest_one.max()
newest_game

2020.0

### What is the mean for European sales?

In [11]:
# Arithmetic mean of the EU_Sales column.
eu_sales = df["EU_Sales"]
eu_mean_sales = eu_sales.mean()
eu_mean_sales

0.14665200626581515

### What is the range for JP_Sales?

In [12]:
jp_max_sales = df["EU_Sales"].max()
jp_min_sales = df["EU_Sales"].min()

jp_range_sales =jp_max_sales-jp_min_sales
jp_range_sales

29.02

In [13]:
def test():

    def assert_equal(actual,expected):
        assert actual == expected, f"Expected {expected} but got {actual}"
    assert_equal(most_common_publisher, 'Electronic Arts')
    assert_equal(most_common_platform, 'DS')
    assert_equal(most_common_genre, 'Action')
    assert_equal(top_twenty_highest_grossing_games.iloc[0], "Wii Sports")
    assert_equal(top_twenty_highest_grossing_games.iloc[19], "Brain Age: Train Your Brain in Minutes a Day")
    assert_equal(na_median_sales, 0.08)
    assert_equal(na_std,50.47898767479108)
    assert_equal(num_of_sales[0], 0.5233896418516336)
    assert_equal(num_of_sales[1], 0.6994037735849057)
    assert_equal(newest_game,2020.0)
    assert_equal(eu_mean_sales,0.14665200626581515)
    assert_equal(jp_range_sales,29.02)
    print("Success!!!")

test()

Success!!!
