In [10]:
!pip install pandas numpy scikit-learn mlxtend




In [12]:
import pandas as pd

# Load the dataset into a DataFrame
data = pd.read_csv("Video_Games_Sales_as_at_22_Dec_2016.csv")

# Preview the DataFrame
data.head()


Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales,Critic_Score,Critic_Count,User_Score,User_Count,Developer,Rating
0,Wii Sports,Wii,2006.0,Sports,Nintendo,41.36,28.96,3.77,8.45,82.53,76.0,51.0,8.0,322.0,Nintendo,E
1,Super Mario Bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24,,,,,,
2,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.68,12.76,3.79,3.29,35.52,82.0,73.0,8.3,709.0,Nintendo,E
3,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.61,10.93,3.28,2.95,32.77,80.0,73.0,8.0,192.0,Nintendo,E
4,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,11.27,8.89,10.22,1.0,31.37,,,,,,


In [13]:
print(data.columns)


Index(['Name', 'Platform', 'Year_of_Release', 'Genre', 'Publisher', 'NA_Sales',
       'EU_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales', 'Critic_Score',
       'Critic_Count', 'User_Score', 'User_Count', 'Developer', 'Rating'],
      dtype='object')


In [14]:
# Check for missing values
missing_values = data.isnull().sum()
print(missing_values)

# Handle missing values (example: drop rows with missing values)
data = data.dropna()


Name                  2
Platform              0
Year_of_Release     269
Genre                 2
Publisher            54
NA_Sales              0
EU_Sales              0
JP_Sales              0
Other_Sales           0
Global_Sales          0
Critic_Score       8582
Critic_Count       8582
User_Score         6704
User_Count         9129
Developer          6623
Rating             6769
dtype: int64


In [15]:
data = pd.get_dummies(data)


In [16]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
data_scaled = scaler.fit_transform(data)


In [17]:
from sklearn.cluster import KMeans

# Initialize a KMeans model with k=3 clusters
kmeans = KMeans(n_clusters=3, random_state=42)

# Fit the model to the data
kmeans.fit(data_scaled)

# Get cluster labels for each data point
cluster_labels = kmeans.labels_

# Add cluster labels to the DataFrame
data['Cluster'] = cluster_labels

# Analyze the clusters
print(data['Cluster'].value_counts())




Cluster
1    6820
2       4
0       1
Name: count, dtype: int64


In [18]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# Assuming 'Global_Sales' is the correct target column
X = data.drop('Global_Sales', axis=1)
y = data['Global_Sales']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model's performance
from sklearn.metrics import mean_squared_error, r2_score

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')


Mean Squared Error: 695198952.5049715
R-squared: -255807746.9907316


In [19]:
from mlxtend.frequent_patterns import apriori
from mlxtend.preprocessing import TransactionEncoder

# Create a list of transactions
transactions = data.applymap(str).values.tolist()

# Apply apriori algorithm
te = TransactionEncoder()
te_ary = te.fit(transactions).transform(transactions)
df = pd.DataFrame(te_ary, columns=te.columns_)

frequent_itemsets = apriori(df, min_support=0.02, use_colnames=True)

# Analyze the frequent itemsets
print(frequent_itemsets)


       support                             itemsets
0     0.752674                                (0.0)
1     0.353993                               (0.01)
2     0.267692                               (0.02)
3     0.202784                               (0.03)
4     0.172308                               (0.04)
...        ...                                  ...
2426  0.023004    (0.02, True, 1, 0.0, False, 0.09)
2427  0.022857  (0.02, True, 1, 0.0, False, 2005.0)
2428  0.021392  (0.02, True, 1, 0.0, False, 2006.0)
2429  0.023736    (0.03, True, 0.04, 1, False, 0.0)
2430  0.022418    (0.05, True, 0.04, 1, False, 0.0)

[2431 rows x 2 columns]
