<a href="https://colab.research.google.com/github/rj-adity/Wingo/blob/main/Wingo1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [105]:
import pandas as pd

# Raw GitHub location of the Wingo results CSV
url = 'https://raw.githubusercontent.com/rj-adity/Wingo/main/wingo.csv'

# Load the remote CSV straight into a DataFrame
df = pd.read_csv(filepath_or_buffer=url)

# Show the frame (pandas truncates the middle rows when printing)
print(df)

          id         period  amount     game  status           time
0       1520  2022070110000       6  wingo10       1  1661533423187
1       1521  2022070110001       5  wingo10       1  1661533423187
2       1522  2022070110000       6   wingo5       1  1661533423187
3       1523  2022070110001       4   wingo5       1  1661533423187
4       1524  2022070110000       6   wingo3       1  1661533423187
...      ...            ...     ...      ...     ...            ...
13580  15100  2022070112771       0   wingo3       0  1665538200598
13581  15101  2022070118319       8    wingo       1  1665538200598
13582  15102  2022070111659       0   wingo5       0  1665538200602
13583  15103  2022070110831       0  wingo10       0  1665538200601
13584  15104  2022070118320       0    wingo       0  1665538260666

[13585 rows x 6 columns]


# **Preprocessing the data**


In [106]:
# Preview the first few rows of the dataset
print(df.head())

# Data type of each column
print(df.dtypes)

# Structural summary of the dataset.
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() appends a stray "None" to the output.
df.info()

# Summary statistics of the numerical columns
print(df.describe())

# Replace the epoch-millisecond 'time' column with hour / minute / second
# features. The guard keeps the cell idempotent: once 'time' has been removed,
# re-running the cell skips this step instead of raising KeyError.
if 'time' in df.columns:
    timestamps = pd.to_datetime(df['time'], unit='ms')
    # assign() appends the new columns at the end, then 'time' is dropped,
    # giving the same column order as adding them one by one.
    df = df.assign(
        hour=timestamps.dt.hour,
        minute=timestamps.dt.minute,
        second=timestamps.dt.second,
    ).drop(columns=['time'])


     id         period  amount     game  status           time
0  1520  2022070110000       6  wingo10       1  1661533423187
1  1521  2022070110001       5  wingo10       1  1661533423187
2  1522  2022070110000       6   wingo5       1  1661533423187
3  1523  2022070110001       4   wingo5       1  1661533423187
4  1524  2022070110000       6   wingo3       1  1661533423187
id         int64
period     int64
amount     int64
game      object
status     int64
time       int64
dtype: object
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13585 entries, 0 to 13584
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      13585 non-null  int64 
 1   period  13585 non-null  int64 
 2   amount  13585 non-null  int64 
 3   game    13585 non-null  object
 4   status  13585 non-null  int64 
 5   time    13585 non-null  int64 
dtypes: int64(5), object(1)
memory usage: 636.9+ KB
None
                 id        period        amount      

In [107]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
# Define features (X) and target variable (y).
# 'period' is dropped because it is not needed for prediction; 'status' is the
# target itself and must be excluded too, otherwise the label leaks into the
# feature matrix and any model trained on X can trivially read the answer.
X = df.drop(columns=['period', 'status'])
y = df['status']




In [112]:
# Perform one-hot encoding for the 'game' column.
# The indicator dtype is pinned to uint8: pandas >= 2.0 defaults to bool, and
# bool columns are not matched by the 'number' dtype selector, so a numeric-only
# check on the features would flag them. uint8 gives the same result on every
# pandas version.
df_encoded = pd.get_dummies(df, columns=['game'], dtype='uint8')

# Check the first few rows of the encoded DataFrame
print(df_encoded.head())

# Features are every column except the target; the target is 'status'.
# NOTE(review): 'id' and 'period' look like row/round identifiers, and 'amount'
# may be determined by 'status' (e.g. unset until a round is settled) - confirm
# these are legitimate predictors; near-perfect accuracy would point to leakage.
X = df_encoded.drop(['status'], axis=1)
y = df_encoded['status']

     id         period  amount  status  hour  minute  second  game_wingo  \
0  1520  2022070110000       6       1    17       3      43           0   
1  1521  2022070110001       5       1    17       3      43           0   
2  1522  2022070110000       6       1    17       3      43           0   
3  1523  2022070110001       4       1    17       3      43           0   
4  1524  2022070110000       6       1    17       3      43           0   

   game_wingo10  game_wingo3  game_wingo5  
0             1            0            0  
1             1            0            0  
2             0            0            1  
3             0            0            1  
4             0            1            0  


In [113]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
split_parts = train_test_split(X, y, random_state=42, test_size=0.2)
X_train, X_test, y_train, y_test = split_parts

In [114]:
# List any training feature whose dtype is not numeric
non_numeric_frame = X_train.select_dtypes(exclude=['number'])
non_numeric_columns = non_numeric_frame.columns
print("Non-numeric columns:", non_numeric_columns)

Non-numeric columns: Index([], dtype='object')


# **Random Forest**

In [115]:
# Build a 100-tree Random Forest with a fixed seed for repeatable results
rf_model = RandomForestClassifier(
    random_state=42,
    n_estimators=100,
)

# Fit on the training split (the fitted estimator is the cell's displayed value)
rf_model.fit(X_train, y_train)

In [116]:
# Show the dtype of every training feature column
column_types = X_train.dtypes
print(column_types)

id              int64
period          int64
amount          int64
hour            int64
minute          int64
second          int64
game_wingo      uint8
game_wingo10    uint8
game_wingo3     uint8
game_wingo5     uint8
dtype: object


In [117]:
# Predict the class label of every held-out test row
predictions = rf_model.predict(X=X_test)

In [119]:
# Fraction of test rows whose predicted label matches the true label
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
print("Accuracy:", accuracy)

Accuracy: 1.0


In [121]:
# Predict on the training split first, then on the testing split
train_predictions, test_predictions = (
    rf_model.predict(split) for split in (X_train, X_test)
)

# Score each set of predictions against its true labels
train_accuracy = accuracy_score(y_train, train_predictions)
test_accuracy = accuracy_score(y_test, test_predictions)

# Report both figures side by side to compare fit on seen vs. unseen rows
for label, value in (
    ("Training Accuracy:", train_accuracy),
    ("Testing Accuracy:", test_accuracy),
):
    print(label, value)

Training Accuracy: 1.0
Testing Accuracy: 1.0


In [122]:
from sklearn.model_selection import cross_val_score

# Score the model with 5-fold cross-validation on the training split
scores = cross_val_score(estimator=rf_model, X=X_train, y=y_train, cv=5)

# Per-fold scores followed by their mean
print("Cross-Validation Scores:", scores)
print("Average Cross-Validation Score:", scores.mean())



Cross-Validation Scores: [0.99954002 0.99954002 0.99954002 1.         1.        ]
Average Cross-Validation Score: 0.9997240110395584


In [123]:
from sklearn.model_selection import StratifiedKFold

# Five shuffled, class-stratified folds with a fixed seed
skf = StratifiedKFold(
    n_splits=5,
    random_state=42,
    shuffle=True,
)

# Score the model on the training split using the explicit splitter
scores = cross_val_score(estimator=rf_model, X=X_train, y=y_train, cv=skf)

# Per-fold scores followed by their mean
print("Stratified Cross-Validation Scores:", scores)
print("Average Stratified Cross-Validation Score:", scores.mean())



Stratified Cross-Validation Scores: [0.99954002 0.99954002 0.99954002 1.         1.        ]
Average Stratified Cross-Validation Score: 0.9997240110395584
